!13032 Add modules of Sponge

From: @zhangxinfeng3 Reviewed-by: @ljl0711,@wang_zi_dong Signed-off-by: @wang_zi_dong
5 years ago · 7583b258df
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -399,7 +399,8 @@ if(ENABLE_GPU)
                          ${CUDNN_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcudart.so
                          ${CUDA_PATH}/lib64/stubs/libcuda.so
                          ${CUDA_PATH}/lib64/libcusolver.so)
                          ${CUDA_PATH}/lib64/libcusolver.so
                          ${CUDA_PATH}/lib64/libcufft.so)
    if(ENABLE_MPI)
        set_target_properties(_ms_mpi PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
    endif()
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cu
@@ -0,0 +1,45 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cuh"

 __global__ void GetCenterOfGeometryKernel(const int center_numbers, float center_numbers_inverse,
                                          const int *center_atoms, const VECTOR *crd, VECTOR *center_of_geometry) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < center_numbers) {
    int atom_i = center_atoms[i];
    VECTOR temp = center_numbers_inverse * crd[atom_i];
    atomicAdd(&center_of_geometry[0].x, temp.x);
    atomicAdd(&center_of_geometry[0].y, temp.y);
    atomicAdd(&center_of_geometry[0].z, temp.z);
  }
 }

 void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms,
                         const float *crd_f, float *center_of_geometry_f, cudaStream_t stream) {
  VECTOR *crd = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_f));
  VECTOR *center_of_geometry = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(center_of_geometry_f));
  GetCenterOfGeometryKernel<<<ceilf(static_cast<float>(center_numbers) / 32), 32, 0, stream>>>(
    center_numbers, center_numbers_inverse, center_atoms, crd, center_of_geometry);

  cudaStreamSynchronize(stream);

  return;
 }

 void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms, float *crd_f,
                         float *center_of_geometry_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void GetCenterOfGeometry(const int center_numbers, float center_numbers_inverse, const int *center_atoms,
                         const float *crd_f, float *center_of_geometry_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTER_IMPL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cu
@@ -0,0 +1,51 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cuh"

 __global__ void MDTemperatureKernel(const int residue_numbers, const int *start, const int *end, const VECTOR *atom_vel,
                                    const float *atom_mass, float *ek) {
  int residue_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (residue_i < residue_numbers) {
    VECTOR momentum = {0., 0., 0.};
    float res_mass = 0.;
    int s = start[residue_i];
    int e = end[residue_i];
    float mass_lin;
    for (int atom_i = s; atom_i < e; atom_i = atom_i + 1) {
      mass_lin = atom_mass[atom_i];

      momentum.x = momentum.x + mass_lin * atom_vel[atom_i].x;
      momentum.y = momentum.y + mass_lin * atom_vel[atom_i].y;
      momentum.z = momentum.z + mass_lin * atom_vel[atom_i].z;
      res_mass = res_mass + mass_lin;
    }
    ek[residue_i] = 0.5 * (momentum.x * momentum.x + momentum.y * momentum.y + momentum.z * momentum.z) / res_mass *
                    2. / 3. / CONSTANT_kB / residue_numbers;
  }
 }

 void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
                   const float *atom_mass, float *ek, cudaStream_t stream) {
  VECTOR *atom_vel = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(atom_vel_f));
  MDTemperatureKernel<<<ceilf(static_cast<float>(residue_numbers) / 32), 32, 0, stream>>>(residue_numbers, start, end,
                                                                                          atom_vel, atom_mass, ek);
  cudaStreamSynchronize(stream);

  return;
 }
 void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
                   const float *atom_mass, float *ek, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cuh
@@ -0,0 +1,25 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void MDTemperature(const int residue_numbers, const int *start, const int *end, const float *atom_vel_f,
                   const float *atom_mass, float *ek, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MDTEMPERATURE_IMPL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh
@@ -14,31 +14,59 @@
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPONGE_COMMONHW_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPONGE_COMMONHW_H_
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_SPONGE_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_SPONGE_H_

 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <time.h>

 #include <curand_kernel.h>

 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <cufft.h>
 #include "runtime/device/gpu/cuda_common.h"

 #define CONSTANT_Pi 3.1415926535897932
 #define TWO_DIVIDED_BY_SQRT_PI 1.1283791670218446
 #define CONSTANT_kB 0.00198716
 static dim3 thread_LJ(8, 32);

 struct VECTOR {
  float x;
  float y;
  float z;
 };
 struct INT_VECTOR {
  int int_x;
  int int_y;
  int int_z;
 };
 struct UNSIGNED_INT_VECTOR {
  unsigned int uint_x;
  unsigned int uint_y;
  unsigned int uint_z;
 };
 struct NEIGHBOR_LIST {
  int atom_numbers;
  int *atom_serial;
 };
 struct UINT_VECTOR_LJ_TYPE {
  unsigned int uint_x;
  unsigned int uint_y;
  unsigned int uint_z;
  int LJ_type;
  float charge;
 };

 struct GRID_BUCKET {
  int *atom_serial;
 };
 struct GRID_POINTER {
  int *grid_serial;
 };
 __device__ __host__ static inline VECTOR Get_Periodic_Displacement(const UNSIGNED_INT_VECTOR uvec_a,
                                                                   const UNSIGNED_INT_VECTOR uvec_b,
                                                                   const VECTOR scaler) {
@@ -48,6 +76,15 @@ __device__ __host__ static inline VECTOR Get_Periodic_Displacement(const UNSIGNE
  dr.z = (static_cast<int>(uvec_a.uint_z - uvec_b.uint_z)) * scaler.z;
  return dr;
 }
 __device__ __host__ static inline VECTOR Get_Periodic_Displacement(const UINT_VECTOR_LJ_TYPE uvec_a,
                                                                   const UINT_VECTOR_LJ_TYPE uvec_b,
                                                                   const VECTOR scaler) {
  VECTOR dr;
  dr.x = (static_cast<int>(uvec_a.uint_x - uvec_b.uint_x)) * scaler.x;
  dr.y = (static_cast<int>(uvec_a.uint_y - uvec_b.uint_y)) * scaler.y;
  dr.z = (static_cast<int>(uvec_a.uint_z - uvec_b.uint_z)) * scaler.z;
  return dr;
 }

 __device__ __host__ static inline VECTOR operator+(const VECTOR &veca, const VECTOR &vecb) {
  VECTOR vec;
@@ -91,4 +128,124 @@ __device__ __host__ static inline VECTOR operator^(const VECTOR &veca, const VEC
  return vec;
 }

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_SPONGE_COMMON_H_
 __global__ static void construct_neighbor_list_kernel(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers,
                                                      int *nl_atom_serial, NEIGHBOR_LIST *nl) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) {
    nl[i].atom_numbers = nl_atom_numbers[i];
    nl[i].atom_serial = nl_atom_serial + i * max_neighbor_numbers;
  }
 }

 static inline bool Malloc_Safely(void **address, size_t size) {
  address[0] = NULL;
  address[0] = reinterpret_cast<void *>(malloc(size));
  if (address[0] != NULL) {
    return true;
  } else {
    printf("malloc failed!\n");
    getchar();
    return false;
  }
 }
 static inline bool Cuda_Malloc_Safely(void **address, size_t size) {
  cudaError_t cuda_error = cudaMalloc(&address[0], size);
  if (cuda_error == 0) {
    return true;
  } else {
    printf("cudaMalloc failed! error %d\n", cuda_error);
    getchar();
    return false;
  }
 }

 __global__ static void Copy_Crd_To_New_Crd_Start(const int atom_numbers, const UNSIGNED_INT_VECTOR *crd,
                                                 UINT_VECTOR_LJ_TYPE *new_crd, const int *LJ_type,
                                                 const float *charge) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    new_crd[atom_i].uint_x = crd[atom_i].uint_x;
    new_crd[atom_i].uint_y = crd[atom_i].uint_y;
    new_crd[atom_i].uint_z = crd[atom_i].uint_z;
    new_crd[atom_i].LJ_type = LJ_type[atom_i];
    new_crd[atom_i].charge = charge[atom_i];
  }
 }

 __global__ static void Rand_Normal(const int float4_numbers, curandStatePhilox4_32_10_t *rand_state,
                                   float4 *rand_float4) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < float4_numbers) {
    rand_float4[i] = curand_normal4(&rand_state[i]);
  }
 }

 __global__ static void Setup_Rand_Normal_Kernel(const int float4_numbers, curandStatePhilox4_32_10_t *rand_state,
                                                const int seed) {
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  /* Each thread gets same seed, a different sequence
  number, no offset */
  if (id < float4_numbers) {
    curand_init(seed, id, 0, &rand_state[id]);
  }
 }

 __global__ static void Reset_List(const int element_numbers, int *list, const int replace_element) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = replace_element;
  }
 }

 __global__ static void Reset_List(const int element_numbers, float *list, const float replace_element) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = replace_element;
  }
 }

 __global__ static void Sum_Of_List(const int element_numbers, const float *list, float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.;
  for (int i = threadIdx.x; i < element_numbers; i = i + blockDim.x) {
    lin = lin + list[i];
  }
  atomicAdd(sum, lin);
 }

 __global__ static void Scale_List(const int element_numbers, float *list, float scaler) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = list[i] * scaler;
  }
 }

 __global__ static void Copy_List(const int element_numbers, const int *origin_list, int *list) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = origin_list[i];
  }
 }
 __global__ static void Copy_List(const int element_numbers, const float *origin_list, float *list) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = origin_list[i];
  }
 }

 __global__ static void Print(const size_t size, const float *input_x) {
  for (size_t i = 0; i < size; i++) {
    printf("%f\n", input_x[i]);
  }
  return;
 }
 __global__ static void Print(const size_t size, const int *input_x) {
  for (size_t i = 0; i < size; i++) {
    printf("%d\n", input_x[i]);
  }
  return;
 }

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_SPONGE_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_atom_energy_impl.cu
@@ -29,8 +29,6 @@ __global__ void DihedralAtomEnergyKernel(int dihedral_numbers, const UNSIGNED_IN
    int atom_k = atom_c[dihedral_i];
    int atom_l = atom_d[dihedral_i];

    int temp_ipn = ipn[dihedral_i];

    float temp_pk = pk[dihedral_i];
    float temp_pn = pn[dihedral_i];
    float temp_gamc = gamc[dihedral_i];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_energy_impl.cu
@@ -29,8 +29,6 @@ __global__ void DihedralEnergyKernel(int dihedral_numbers, const UNSIGNED_INT_VE
    int atom_k = atom_c[dihedral_i];
    int atom_l = atom_d[dihedral_i];

    int temp_ipn = ipn[dihedral_i];

    float temp_pk = pk[dihedral_i];
    float temp_pn = pn[dihedral_i];
    float temp_gamc = gamc[dihedral_i];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/dihedral/dihedral_force_impl.cu
@@ -31,7 +31,6 @@ __global__ void DihedralForceKernel(int dihedral_numbers, const UNSIGNED_INT_VEC

    int temp_ipn = ipn[dihedral_i];

    float temp_pk = pk[dihedral_i];
    float temp_pn = pn[dihedral_i];
    float temp_gamc = gamc[dihedral_i];
    float temp_gams = gams[dihedral_i];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cu
@@ -0,0 +1,102 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void LJ_Energy_CUDA(const int atom_numbers, const NEIGHBOR_LIST *nl, const UINT_VECTOR_LJ_TYPE *uint_crd,
                               const VECTOR *boxlength, const float *LJ_type_A, const float *LJ_type_B,
                               const float cutoff_square, float *lj_ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr2;
    float dr_2;
    float dr_4;
    float dr_6;
    float ene_lin = 0.;

    int x, y;
    int atom_pair_LJ_type;
    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];

      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;

      dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
      if (dr2 < cutoff_square) {
        dr_2 = 1. / dr2;
        dr_4 = dr_2 * dr_2;
        dr_6 = dr_4 * dr_2;

        y = (r2.LJ_type - r1.LJ_type);
        x = y >> 31;
        y = (y ^ x) - x;
        x = r2.LJ_type + r1.LJ_type;
        r2.LJ_type = (x + y) >> 1;
        x = (x - y) >> 1;
        atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

        dr_2 = (0.083333333 * LJ_type_A[atom_pair_LJ_type] * dr_6 - 0.166666666 * LJ_type_B[atom_pair_LJ_type]) * dr_6;
        ene_lin = ene_lin + dr_2;
      }
    }
    atomicAdd(&lj_ene[atom_i], ene_lin);
  }
 }

 void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
              const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
              int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *d_LJ_energy_atom,
              cudaStream_t stream) {
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);

  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ_a = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ_a, LJtype, charge);

  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, d_LJ_energy_atom, 0.);

  LJ_Energy_CUDA<<<ceilf(static_cast<float>(atom_numbers) / 8), thread_LJ, 0, stream>>>(
    atom_numbers, nl_a, uint_crd_with_LJ_a, scaler, d_LJ_A, d_LJ_B, cutoff_square, d_LJ_energy_atom);

  return;
 }
 void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
              const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
              int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *d_LJ_energy_atom,
              cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_ENERGY_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_ENERGY_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void LJEnergy(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
              const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
              int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *d_LJ_energy_atom,
              cudaStream_t stream);
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cu
@@ -0,0 +1,116 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void LJ_Force_CUDA(const int atom_numbers, const NEIGHBOR_LIST *nl, const UINT_VECTOR_LJ_TYPE *uint_crd,
                              const VECTOR *boxlength, const float *LJ_type_A, const float *LJ_type_B,
                              const float cutoff_square, VECTOR *frc) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int B = ceilf(static_cast<float>(N) / blockDim.y);
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr2;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_14;
    float frc_abs = 0.;
    VECTOR frc_lin;
    VECTOR frc_record = {0., 0., 0.};

    int x, y;
    int atom_pair_LJ_type;
    for (int j = threadIdx.y * B; j < (threadIdx.y + 1) * B; j = j + 1) {
      if (j < N) {
        atom_j = nl_i.atom_serial[j];
        r2 = uint_crd[atom_j];
        int_x = r2.uint_x - r1.uint_x;
        int_y = r2.uint_y - r1.uint_y;
        int_z = r2.uint_z - r1.uint_z;
        dr.x = boxlength[0].x * int_x;
        dr.y = boxlength[0].y * int_y;
        dr.z = boxlength[0].z * int_z;
        dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
        if (dr2 < cutoff_square) {
          dr_2 = 1. / dr2;
          dr_4 = dr_2 * dr_2;
          dr_8 = dr_4 * dr_4;
          dr_14 = dr_8 * dr_4 * dr_2;

          y = (r2.LJ_type - r1.LJ_type);
          x = y >> 31;
          y = (y ^ x) - x;
          x = r2.LJ_type + r1.LJ_type;
          r2.LJ_type = (x + y) >> 1;
          x = (x - y) >> 1;
          atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

          frc_abs = -LJ_type_A[atom_pair_LJ_type] * dr_14 + LJ_type_B[atom_pair_LJ_type] * dr_8;
          frc_lin.x = frc_abs * dr.x;
          frc_lin.y = frc_abs * dr.y;
          frc_lin.z = frc_abs * dr.z;

          frc_record.x = frc_record.x + frc_lin.x;
          frc_record.y = frc_record.y + frc_lin.y;
          frc_record.z = frc_record.z + frc_lin.z;

          atomicAdd(&frc[atom_j].x, -frc_lin.x);
          atomicAdd(&frc[atom_j].y, -frc_lin.y);
          atomicAdd(&frc[atom_j].z, -frc_lin.z);
        }
      }
    }
    atomicAdd(&frc[atom_i].x, frc_record.x);
    atomicAdd(&frc[atom_i].y, frc_record.y);
    atomicAdd(&frc[atom_i].z, frc_record.z);
  }
 }

 void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
             const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
             int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f,
             cudaStream_t stream) {
  VECTOR *frc = reinterpret_cast<VECTOR *>(frc_f);
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);

  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ_a = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ_a, LJtype, charge);

  LJ_Force_CUDA<<<ceilf(static_cast<float>(atom_numbers) / 8), thread_LJ, 0, stream>>>(
    atom_numbers, nl_a, uint_crd_with_LJ_a, scaler, d_LJ_A, d_LJ_B, cutoff_square, frc);
  return;
 }
 void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
             const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
             int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void LJForce(const int atom_numbers, const float cutoff_square, const int *uint_crd_f, const int *LJtype,
             const float *charge, const float *scaler_f, float *uint_crd_with_LJ, int *nl_atom_numbers,
             int *nl_atom_serial, int *nl, const float *d_LJ_A, const float *d_LJ_B, float *frc_f, cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cu
@@ -0,0 +1,132 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void LJ_Force_With_Direct_CF_CUDA(const int atom_numbers, const NEIGHBOR_LIST *nl,
                                             const UINT_VECTOR_LJ_TYPE *uint_crd, const VECTOR *boxlength,
                                             const float *LJ_type_A, const float *LJ_type_B, const float cutoff,
                                             VECTOR *frc, const float pme_beta, const float sqrt_pi) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_6;
    float frc_abs = 0.;
    VECTOR frc_lin;
    VECTOR frc_record = {0., 0., 0.};

    float charge_i = r1.charge;
    float charge_j;
    float dr_abs;
    float dr_1;
    float beta_dr;
    float frc_cf_abs;

    int x, y;
    int atom_pair_LJ_type;
    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];
      charge_j = r2.charge;

      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;
      dr_abs = norm3df(dr.x, dr.y, dr.z);
      if (dr_abs < cutoff) {
        dr_1 = 1. / dr_abs;
        dr_2 = dr_1 * dr_1;
        dr_4 = dr_2 * dr_2;
        dr_8 = dr_4 * dr_4;
        dr_6 = dr_4 * dr_2;

        y = (r2.LJ_type - r1.LJ_type);
        x = y >> 31;
        y = (y ^ x) - x;
        x = r2.LJ_type + r1.LJ_type;
        r2.LJ_type = (x + y) >> 1;
        x = (x - y) >> 1;
        atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

        frc_abs = (-LJ_type_A[atom_pair_LJ_type] * dr_6 + LJ_type_B[atom_pair_LJ_type]) * dr_8;
        beta_dr = pme_beta * dr_abs;
        frc_cf_abs = beta_dr * sqrt_pi * expf(-beta_dr * beta_dr) + erfcf(beta_dr);
        frc_cf_abs = frc_cf_abs * dr_2 * dr_1;
        frc_cf_abs = charge_i * charge_j * frc_cf_abs;

        frc_abs = frc_abs - frc_cf_abs;

        frc_lin.x = frc_abs * dr.x;
        frc_lin.y = frc_abs * dr.y;
        frc_lin.z = frc_abs * dr.z;

        frc_record.x = frc_record.x + frc_lin.x;
        frc_record.y = frc_record.y + frc_lin.y;
        frc_record.z = frc_record.z + frc_lin.z;

        atomicAdd(&frc[atom_j].x, -frc_lin.x);
        atomicAdd(&frc[atom_j].y, -frc_lin.y);
        atomicAdd(&frc[atom_j].z, -frc_lin.z);
      }
    }
    atomicAdd(&frc[atom_i].x, frc_record.x);
    atomicAdd(&frc[atom_i].y, frc_record.y);
    atomicAdd(&frc[atom_i].z, frc_record.z);
  }
 }

 void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f,
                               const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ,
                               int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A,
                               const float *d_LJ_B, float *frc_f, cudaStream_t stream) {
  VECTOR *frc = reinterpret_cast<VECTOR *>(frc_f);
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);

  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ_a = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ_a, LJtype, charge);

  LJ_Force_With_Direct_CF_CUDA<<<ceilf(static_cast<float>(atom_numbers) / 8), thread_LJ, 0, stream>>>(
    atom_numbers, nl_a, uint_crd_with_LJ_a, scaler, d_LJ_A, d_LJ_B, cutoff, frc, pme_beta, TWO_DIVIDED_BY_SQRT_PI);
  return;
 }

 void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f,
                               const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ,
                               int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A,
                               const float *d_LJ_B, float *frc_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh
@@ -0,0 +1,28 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void LJForceWithPMEDirectForce(const int atom_numbers, const float cutoff, const float pme_beta, const int *uint_crd_f,
                               const int *LJtype, const float *charge, const float *scaler_f, float *uint_crd_with_LJ,
                               int *nl_atom_numbers, int *nl_atom_serial, int *nl, const float *d_LJ_A,
                               const float *d_LJ_B, float *frc_f, cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cu
@@ -0,0 +1,80 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14CFAtomEnergyKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                             const VECTOR *boxlength, const int *a_14, const int *b_14,
                                             const float *cf_scale_factor, float *ene) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i];
    UINT_VECTOR_LJ_TYPE r2 = uint_crd[atom_j];

    int int_x;
    int int_y;
    int int_z;
    VECTOR dr;
    float r_1;
    float ene_lin = 0.;

    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    r_1 = rnorm3df(dr.x, dr.y, dr.z);

    ene_lin = r1.charge * r2.charge * r_1;

    ene_lin *= cf_scale_factor[dihedral_14_i];

    atomicAdd(&ene[atom_i], ene_lin);
  }
 }

 void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *cf_scale_factor, float *ene, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);

  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(atom_numbers, ene, 0.);
  Dihedral14CFAtomEnergyKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, cf_scale_factor, ene);

  cudaStreamSynchronize(stream);

  return;
 }

 void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *cf_scale_factor, float *ene, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh
@@ -0,0 +1,25 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14CFAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *cf_scale_factor, float *ene, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cu
@@ -0,0 +1,80 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14CFEnergyKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                         const VECTOR *boxlength, const int *a_14, const int *b_14,
                                         const float *cf_scale_factor, float *ene) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i];
    UINT_VECTOR_LJ_TYPE r2 = uint_crd[atom_j];

    int int_x;
    int int_y;
    int int_z;
    VECTOR dr;
    float r_1;
    float ene_lin = 0.;

    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    r_1 = rnorm3df(dr.x, dr.y, dr.z);

    ene_lin = r1.charge * r2.charge * r_1;

    ene_lin *= cf_scale_factor[dihedral_14_i];

    ene[dihedral_14_i] = ene_lin;
  }
 }

 void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                        const float *cf_scale_factor, float *ene, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);

  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(atom_numbers, ene, 0.);
  Dihedral14CFEnergyKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, cf_scale_factor, ene);

  cudaStreamSynchronize(stream);

  return;
 }

 void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                        const float *cf_scale_factor, float *ene, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh
@@ -0,0 +1,25 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14CFEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength, const int *a_14, const int *b_14,
                        const float *cf_scale_factor, float *ene, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_CF_ENERGY_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cu
@@ -0,0 +1,102 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14LJAtomEnergyKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                             const VECTOR *boxlength, const int *a_14, const int *b_14,
                                             const float *lj_scale_factor, const float *LJ_type_A,
                                             const float *LJ_type_B, float *ene) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i];
    UINT_VECTOR_LJ_TYPE r2 = uint_crd[atom_j];

    int int_x;
    int int_y;
    int int_z;
    VECTOR dr;
    float dr2;
    float dr_2;
    float dr_4;
    float dr_6;
    float dr_12;
    float ene_lin = 0.;
    int x, y;
    int atom_pair_LJ_type;

    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

    dr_2 = 1. / dr2;
    dr_4 = dr_2 * dr_2;
    dr_6 = dr_4 * dr_2;
    dr_12 = dr_6 * dr_6;

    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

    ene_lin = 0.08333333 * LJ_type_A[atom_pair_LJ_type] * dr_12 -
              0.1666666 * LJ_type_B[atom_pair_LJ_type] * dr_6;  // LJ的A,B系数已经乘以12和6因此要反乘
    ene_lin *= lj_scale_factor[dihedral_14_i];

    atomicAdd(&ene[atom_i], ene_lin);
  }
 }

 void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *lj_scale_factor, const float *LJ_type_A,
                            const float *LJ_type_B, float *ene, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);

  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(atom_numbers, ene, 0.);
  Dihedral14LJAtomEnergyKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, LJ_type_A, LJ_type_B, ene);

  cudaStreamSynchronize(stream);

  return;
 }

 void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *lj_scale_factor, const float *LJ_type_A,
                            const float *LJ_type_B, float *ene, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14LJAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                            const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                            const int *b_14, const float *lj_scale_factor, const float *LJ_type_A,
                            const float *LJ_type_B, float *ene, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cu
@@ -0,0 +1,140 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14LJCFForceWithAtomEnergyKernel(const int dihedral_14_numbers,
                                                        const UINT_VECTOR_LJ_TYPE *uint_crd, const VECTOR *boxlength,
                                                        const int *a_14, const int *b_14, const float *lj_scale_factor,
                                                        const float *cf_scale_factor, const float *LJ_type_A,
                                                        const float *LJ_type_B, VECTOR *frc, float *atom_energy) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1, r2;
    VECTOR dr;
    float dr_abs;
    float dr2;
    float dr_1;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_14;
    float frc_abs = 0.;
    VECTOR temp_frc;

    float ene_lin;
    float ene_lin2;

    int x, y;
    int atom_pair_LJ_type;

    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    r1 = uint_crd[atom_i];
    r2 = uint_crd[atom_j];
    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

    dr_2 = 1.0 / dr2;
    dr_4 = dr_2 * dr_2;
    dr_8 = dr_4 * dr_4;
    dr_14 = dr_8 * dr_4 * dr_2;
    dr_abs = norm3df(dr.x, dr.y, dr.z);
    dr_1 = 1. / dr_abs;

    float charge_i = r1.charge;
    float charge_j = r2.charge;
    float frc_cf_abs;
    frc_cf_abs = cf_scale_factor[dihedral_14_i] * dr_2 * dr_1;
    frc_cf_abs = -charge_i * charge_j * frc_cf_abs;

    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

    frc_abs = -LJ_type_A[atom_pair_LJ_type] * dr_14 + LJ_type_B[atom_pair_LJ_type] * dr_8;
    frc_abs *= lj_scale_factor[dihedral_14_i];

    frc_abs += frc_cf_abs;
    temp_frc.x = frc_abs * dr.x;
    temp_frc.y = frc_abs * dr.y;
    temp_frc.z = frc_abs * dr.z;

    atomicAdd(&frc[atom_j].x, -temp_frc.x);
    atomicAdd(&frc[atom_j].y, -temp_frc.y);
    atomicAdd(&frc[atom_j].z, -temp_frc.z);
    atomicAdd(&frc[atom_i].x, temp_frc.x);
    atomicAdd(&frc[atom_i].y, temp_frc.y);
    atomicAdd(&frc[atom_i].z, temp_frc.z);

    ene_lin = r1.charge * r2.charge * dr_1;
    ene_lin *= cf_scale_factor[dihedral_14_i];
    ene_lin2 = 0.08333333 * LJ_type_A[atom_pair_LJ_type] * dr_4 * dr_8 -
               0.1666666 * LJ_type_B[atom_pair_LJ_type] * dr_4 * dr_2;  // LJ的A,B系数已经乘以12和6因此要反乘
    ene_lin2 *= lj_scale_factor[dihedral_14_i];

    atomicAdd(&atom_energy[atom_i], ene_lin + ene_lin2);
  }
 }

 void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                       const int *LJtype, const float *charge, const float *boxlength_f,
                                       const int *a_14, const int *b_14, const float *lj_scale_factor,
                                       const float *cf_scale_factor, const float *LJ_type_A, const float *LJ_type_B,
                                       float *frc_f, float *atom_energy, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, frc_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(atom_numbers, atom_energy, 0.);
  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));

  Dihedral14LJCFForceWithAtomEnergyKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, cf_scale_factor, LJ_type_A,
    LJ_type_B, frc, atom_energy);

  cudaStreamSynchronize(stream);

  return;
 }

 void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                   const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                                   const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor,
                                   const float *LJ_type_A, const float *LJ_type_B, float *frc, float *atom_energy,
                                   cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14LJCFForceWithAtomEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                       const int *LJtype, const float *charge, const float *boxlength_f,
                                       const int *a_14, const int *b_14, const float *lj_scale_factor,
                                       const float *cf_scale_factor, const float *LJ_type_A, const float *LJ_type_B,
                                       float *frc, float *atom_energy, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cu
@@ -0,0 +1,102 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14LJEnergyKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                         const VECTOR *boxlength, const int *a_14, const int *b_14,
                                         const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B,
                                         float *ene) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i];
    UINT_VECTOR_LJ_TYPE r2 = uint_crd[atom_j];

    int int_x;
    int int_y;
    int int_z;
    VECTOR dr;
    float dr2;
    float dr_2;
    float dr_4;
    float dr_6;
    float dr_12;
    float ene_lin = 0.;
    int x, y;
    int atom_pair_LJ_type;

    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

    dr_2 = 1. / dr2;
    dr_4 = dr_2 * dr_2;
    dr_6 = dr_4 * dr_2;
    dr_12 = dr_6 * dr_6;

    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

    ene_lin = 0.08333333 * LJ_type_A[atom_pair_LJ_type] * dr_12 -
              0.1666666 * LJ_type_B[atom_pair_LJ_type] * dr_6;  // LJ的A,B系数已经乘以12和6因此要反乘
    ene_lin *= lj_scale_factor[dihedral_14_i];

    ene[dihedral_14_i] = ene_lin;
  }
 }

 void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                        const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *ene,
                        cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(dihedral_14_numbers, ene, 0.);
  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));

  Dihedral14LJEnergyKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, LJ_type_A, LJ_type_B, ene);

  cudaStreamSynchronize(stream);

  return;
 }

 void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                        const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *ene,
                        cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14LJEnergy(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                        const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                        const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *ene,
                        cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_ENERGY_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cu
@@ -0,0 +1,111 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14LJForceKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                        const VECTOR *boxlength, const int *a_14, const int *b_14,
                                        const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B,
                                        VECTOR *frc) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1, r2;
    VECTOR dr;
    float dr2;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_14;
    float frc_abs = 0.;
    VECTOR temp_frc;
    int x, y;
    int atom_pair_LJ_type;

    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    r1 = uint_crd[atom_i];
    r2 = uint_crd[atom_j];

    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

    dr_2 = 1.0 / dr2;
    dr_4 = dr_2 * dr_2;
    dr_8 = dr_4 * dr_4;
    dr_14 = dr_8 * dr_4 * dr_2;

    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

    frc_abs = -LJ_type_A[atom_pair_LJ_type] * dr_14 + LJ_type_B[atom_pair_LJ_type] * dr_8;
    frc_abs *= lj_scale_factor[dihedral_14_i];
    temp_frc.x = frc_abs * dr.x;
    temp_frc.y = frc_abs * dr.y;
    temp_frc.z = frc_abs * dr.z;

    atomicAdd(&frc[atom_j].x, -temp_frc.x);
    atomicAdd(&frc[atom_j].y, -temp_frc.y);
    atomicAdd(&frc[atom_j].z, -temp_frc.z);
    atomicAdd(&frc[atom_i].x, temp_frc.x);
    atomicAdd(&frc[atom_i].y, temp_frc.y);
    atomicAdd(&frc[atom_i].z, temp_frc.z);
  }
 }

 void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                       const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                       const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *frc_f,
                       cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);
  cudaStreamSynchronize(stream);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, frc_f, 0.);
  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));

  Dihedral14LJForceKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, LJ_type_A, LJ_type_B, frc);
  cudaStreamSynchronize(stream);
  return;
 }

 void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                       const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                       const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *frc_f,
                       cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14LJForce(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f, const int *LJtype,
                       const float *charge, const float *boxlength_f, const int *a_14, const int *b_14,
                       const float *lj_scale_factor, const float *LJ_type_A, const float *LJ_type_B, float *frc_f,
                       cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cu
@@ -0,0 +1,124 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void Dihedral14LJForceWithDirectCFKernel(const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd,
                                                    const VECTOR *boxlength, const int *a_14, const int *b_14,
                                                    const float *lj_scale_factor, const float *cf_scale_factor,
                                                    const float *LJ_type_A, const float *LJ_type_B, VECTOR *frc) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1, r2;
    VECTOR dr;
    float dr_abs;
    float dr2;
    float dr_1;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_14;
    float frc_abs = 0.;
    VECTOR temp_frc;

    int x, y;
    int atom_pair_LJ_type;

    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];

    r1 = uint_crd[atom_i];
    r2 = uint_crd[atom_j];
    int_x = r2.uint_x - r1.uint_x;
    int_y = r2.uint_y - r1.uint_y;
    int_z = r2.uint_z - r1.uint_z;
    dr.x = boxlength[0].x * int_x;
    dr.y = boxlength[0].y * int_y;
    dr.z = boxlength[0].z * int_z;
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

    dr_2 = 1.0 / dr2;
    dr_4 = dr_2 * dr_2;
    dr_8 = dr_4 * dr_4;
    dr_14 = dr_8 * dr_4 * dr_2;
    dr_abs = norm3df(dr.x, dr.y, dr.z);
    dr_1 = 1. / dr_abs;

    float charge_i = r1.charge;
    float charge_j = r2.charge;
    float frc_cf_abs;
    frc_cf_abs = cf_scale_factor[dihedral_14_i] * dr_2 * dr_1;
    frc_cf_abs = -charge_i * charge_j * frc_cf_abs;
    // LJ
    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;

    frc_abs = -LJ_type_A[atom_pair_LJ_type] * dr_14 + LJ_type_B[atom_pair_LJ_type] * dr_8;
    frc_abs *= lj_scale_factor[dihedral_14_i];

    frc_abs += frc_cf_abs;
    temp_frc.x = frc_abs * dr.x;
    temp_frc.y = frc_abs * dr.y;
    temp_frc.z = frc_abs * dr.z;

    atomicAdd(&frc[atom_j].x, -temp_frc.x);
    atomicAdd(&frc[atom_j].y, -temp_frc.y);
    atomicAdd(&frc[atom_j].z, -temp_frc.z);
    atomicAdd(&frc[atom_i].x, temp_frc.x);
    atomicAdd(&frc[atom_i].y, temp_frc.y);
    atomicAdd(&frc[atom_i].z, temp_frc.z);
  }
 }

 void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                   const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                                   const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor,
                                   const float *LJ_type_A, const float *LJ_type_B, float *frc_f, cudaStream_t stream) {
  size_t thread_per_block = 128;
  size_t block_per_grid = ceilf(static_cast<float>(atom_numbers) / 128);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = NULL;
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&uint_crd_with_LJ), sizeof(UINT_VECTOR_LJ_TYPE) * atom_numbers);

  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));

  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);
  cudaStreamSynchronize(stream);
  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, frc_f, 0.);
  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));

  Dihedral14LJForceWithDirectCFKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, cf_scale_factor, LJ_type_A,
    LJ_type_B, frc);

  return;
 }

 void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                   const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                                   const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor,
                                   const float *LJ_type_A, const float *LJ_type_B, float *frc_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void Dihedral14LJForceWithDirectCF(const int dihedral_14_numbers, const int atom_numbers, const int *uint_crd_f,
                                   const int *LJtype, const float *charge, const float *boxlength_f, const int *a_14,
                                   const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor,
                                   const float *LJ_type_A, const float *LJ_type_B, float *frc_f, cudaStream_t stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cu
@@ -0,0 +1,419 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"

 __global__ void Copy_List(const int element_numbers, const int *origin_list, int *list) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = origin_list[i];
  }
 }
 __global__ void Copy_List(const int element_numbers, const float *origin_list, float *list) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < element_numbers) {
    list[i] = origin_list[i];
  }
 }

 __global__ void Crd_To_Uint_Crd(const int atom_numbers, float *scale_factor, const VECTOR *crd,
                                UNSIGNED_INT_VECTOR *uint_crd) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    uint_crd[atom_i].uint_x = crd[atom_i].x * scale_factor[0];
    uint_crd[atom_i].uint_y = crd[atom_i].y * scale_factor[1];
    uint_crd[atom_i].uint_z = crd[atom_i].z * scale_factor[2];
    /*uint_crd[atom_i].uint_x = 2 * uint_crd[atom_i].uint_x;
    uint_crd[atom_i].uint_y = 2 * uint_crd[atom_i].uint_y;
    uint_crd[atom_i].uint_z = 2 * uint_crd[atom_i].uint_z;*/
    uint_crd[atom_i].uint_x = uint_crd[atom_i].uint_x << 1;
    uint_crd[atom_i].uint_y = uint_crd[atom_i].uint_y << 1;
    uint_crd[atom_i].uint_z = uint_crd[atom_i].uint_z << 1;
  }
 }

 __global__ void Vector_Translation(const int vector_numbers, VECTOR *vec_list, const VECTOR translation_vec) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vector_numbers) {
    vec_list[i].x = vec_list[i].x + translation_vec.x;
    vec_list[i].y = vec_list[i].y + translation_vec.y;
    vec_list[i].z = vec_list[i].z + translation_vec.z;
  }
 }
 __global__ void Vector_Translation(const int vector_numbers, VECTOR *vec_list, const VECTOR *translation_vec) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vector_numbers) {
    vec_list[i].x = vec_list[i].x + translation_vec[0].x;
    vec_list[i].y = vec_list[i].y + translation_vec[0].y;
    vec_list[i].z = vec_list[i].z + translation_vec[0].z;
  }
 }
 __global__ void Crd_Periodic_Map(const int atom_numbers, VECTOR *crd, const float *box_length) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    if (crd[atom_i].x >= 0) {
      if (crd[atom_i].x < box_length[0]) {
      } else {
        crd[atom_i].x = crd[atom_i].x - box_length[0];
      }
    } else {
      crd[atom_i].x = crd[atom_i].x + box_length[0];
    }

    if (crd[atom_i].y >= 0) {
      if (crd[atom_i].y < box_length[1]) {
      } else {
        crd[atom_i].y = crd[atom_i].y - box_length[1];
      }
    } else {
      crd[atom_i].y = crd[atom_i].y + box_length[1];
    }

    if (crd[atom_i].z >= 0) {
      if (crd[atom_i].z < box_length[2]) {
      } else {
        crd[atom_i].z = crd[atom_i].z - box_length[2];
      }
    } else {
      crd[atom_i].z = crd[atom_i].z + box_length[2];
    }
  }
 }

 __global__ void Clear_Grid_Bucket(const int grid_numbers, int *atom_numbers_in_grid_bucket, GRID_BUCKET *bucket) {
  int grid_serial = blockDim.x * blockIdx.x + threadIdx.x;
  if (grid_serial < grid_numbers) {
    GRID_BUCKET bucket_i = bucket[grid_serial];
    for (int i = 0; i < atom_numbers_in_grid_bucket[grid_serial]; i = i + 1) {
      bucket_i.atom_serial[i] = -1;
    }
    atom_numbers_in_grid_bucket[grid_serial] = 0;
  }
 }

 __global__ void Find_Atom_In_Grid_Serial(const int atom_numbers, const float *grid_length_inverse, const VECTOR *crd,
                                         const int *grid_N, const int gridxy, int *atom_in_grid_serial) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int Nx = static_cast<float>(crd[atom_i].x) * grid_length_inverse[0];  // crd.x must < boxlength.x
    int Ny = static_cast<float>(crd[atom_i].y) * grid_length_inverse[1];
    int Nz = static_cast<float>(crd[atom_i].z) * grid_length_inverse[2];
    Nx = Nx & ((Nx - grid_N[0]) >> 31);
    Ny = Ny & ((Ny - grid_N[1]) >> 31);
    Nz = Nz & ((Nz - grid_N[2]) >> 31);
    atom_in_grid_serial[atom_i] = Nz * gridxy + Ny * grid_N[0] + Nx;
  }
 }

 __global__ void Put_Atom_In_Grid_Bucket(const int atom_numbers, const int *atom_in_grid_serial, GRID_BUCKET *bucket,
                                        int *atom_numbers_in_grid_bucket) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int grid_serial = atom_in_grid_serial[atom_i];
    GRID_BUCKET bucket_i = bucket[grid_serial];
    int a = atom_numbers_in_grid_bucket[grid_serial];
    atomicCAS(&bucket_i.atom_serial[a], -1, atom_i);
    if (bucket_i.atom_serial[a] != atom_i) {
      while (true) {
        a = a + 1;
        atomicCAS(&bucket_i.atom_serial[a], -1, atom_i);
        if (bucket_i.atom_serial[a] == atom_i) {
          atomicAdd(&atom_numbers_in_grid_bucket[grid_serial], 1);
          break;
        }
      }
    } else {
      atomicAdd(&atom_numbers_in_grid_bucket[grid_serial], 1);
    }
  }
 }
 __global__ void Find_atom_neighbors(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                    const float *uint_dr_to_dr_cof, const int *atom_in_grid_serial,
                                    const GRID_POINTER *gpointer, const GRID_BUCKET *bucket,
                                    const int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *nl,
                                    const float cutoff_skin_square) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int grid_serial = atom_in_grid_serial[atom_i];
    int grid_serial2;
    int atom_numbers_in_nl_lin = 0;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UNSIGNED_INT_VECTOR uint_crd_i = uint_crd[atom_i];
    NEIGHBOR_LIST nl_i = nl[atom_i];
    GRID_POINTER gpointer_i = gpointer[grid_serial];
    VECTOR dr;
    float dr2;
    for (int grid_cycle = 0; grid_cycle < 125; grid_cycle = grid_cycle + 1) {
      grid_serial2 = gpointer_i.grid_serial[grid_cycle];
      GRID_BUCKET bucket_i = bucket[grid_serial2];
      for (int i = 0; i < atom_numbers_in_grid_bucket[grid_serial2]; i = i + 1) {
        atom_j = bucket_i.atom_serial[i];
        if (atom_j > atom_i) {
          int_x = uint_crd[atom_j].uint_x - uint_crd_i.uint_x;
          int_y = uint_crd[atom_j].uint_y - uint_crd_i.uint_y;
          int_z = uint_crd[atom_j].uint_z - uint_crd_i.uint_z;
          dr.x = uint_dr_to_dr_cof[0] * int_x;
          dr.y = uint_dr_to_dr_cof[1] * int_y;
          dr.z = uint_dr_to_dr_cof[2] * int_z;
          dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
          if (dr2 < cutoff_skin_square) {
            nl_i.atom_serial[atom_numbers_in_nl_lin] = atom_j;
            atom_numbers_in_nl_lin = atom_numbers_in_nl_lin + 1;
          }
        }
      }
    }  // 124 grid cycle
    nl[atom_i].atom_numbers = atom_numbers_in_nl_lin;
  }
 }

 __global__ void Is_need_refresh_neighbor_list_cuda(const int atom_numbers, const VECTOR *crd, const VECTOR *old_crd,
                                                   const float half_skin_square, int *need_refresh_flag) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < atom_numbers) {
    VECTOR r1 = crd[i];
    VECTOR r2 = old_crd[i];
    r1.x = r1.x - r2.x;
    r1.y = r1.y - r2.y;
    r1.z = r1.z - r2.z;
    float r1_2 = r1.x * r1.x + r1.y * r1.y + r1.z * r1.z;
    if (r1_2 > half_skin_square) {
      atomicExch(&need_refresh_flag[0], 1);
    }
  }
 }

 __global__ void Delete_Excluded_Atoms_Serial_In_Neighbor_List(const int atom_numbers, NEIGHBOR_LIST *nl,
                                                              const int *excluded_list_start, const int *excluded_list,
                                                              const int *excluded_atom_numbers) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int excluded_number = excluded_atom_numbers[atom_i];
    if (excluded_number > 0) {
      int list_start = excluded_list_start[atom_i];
      int atom_min = excluded_list[list_start];
      int list_end = list_start + excluded_number;
      int atom_max = excluded_list[list_end - 1];
      NEIGHBOR_LIST nl_i = nl[atom_i];
      int atomnumbers_in_nl_lin = nl_i.atom_numbers;
      int atom_j;
      int excluded_atom_numbers_lin = list_end - list_start;
      int excluded_atom_numbers_count = 0;
      for (int i = 0; i < atomnumbers_in_nl_lin; i = i + 1) {
        atom_j = nl_i.atom_serial[i];
        if (atom_j < atom_min || atom_j > atom_max) {
          continue;
        } else {
          for (int j = list_start; j < list_end; j = j + 1) {
            if (atom_j == excluded_list[j]) {
              atomnumbers_in_nl_lin = atomnumbers_in_nl_lin - 1;
              nl_i.atom_serial[i] = nl_i.atom_serial[atomnumbers_in_nl_lin];
              excluded_atom_numbers_count = excluded_atom_numbers_count + 1;
              i = i - 1;
            }
          }
          if (excluded_atom_numbers_count < excluded_atom_numbers_lin) {
          } else {
            break;
          }  // break
        }    // in the range of excluded min to max
      }      // cycle for neighbors
      nl[atom_i].atom_numbers = atomnumbers_in_nl_lin;
    }  // if need excluded
  }
 }

 void Refresh_Neighbor_List(int *refresh_sign, const int thread, const int atom_numbers, VECTOR *crd, VECTOR *old_crd,
                           UNSIGNED_INT_VECTOR *uint_crd, float *crd_to_uint_crd_cof, float *uint_dr_to_dr_cof,
                           int *atom_in_grid_serial, const float skin, float *box_length, const GRID_POINTER *gpointer,
                           GRID_BUCKET *bucket, int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *d_nl,
                           int *excluded_list_start, int *excluded_list, int *excluded_numbers,
                           float cutoff_skin_square, int grid_numbers, float *grid_length_inverse, int *grid_N, int Nxy,
                           cudaStream_t stream) {
  if (refresh_sign[0] == 1) {
    VECTOR trans_vec = {-skin, -skin, -skin};
    Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / thread), thread, 0, stream>>>(
      grid_numbers, atom_numbers_in_grid_bucket, bucket);

    Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd,
                                                                                                trans_vec);

    Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd,
                                                                                              box_length);

    Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
      atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial);

    trans_vec.x = -trans_vec.x;
    trans_vec.y = -trans_vec.y;
    trans_vec.z = -trans_vec.z;

    Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd,
                                                                                                trans_vec);

    Copy_List<<<ceilf(static_cast<float>(3. * atom_numbers) / thread), thread, 0, stream>>>(
      3 * atom_numbers, reinterpret_cast<float *>(crd), reinterpret_cast<float *>(old_crd));

    Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
      atom_numbers, atom_in_grid_serial, bucket, atom_numbers_in_grid_bucket);

    Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
      atom_numbers, crd_to_uint_crd_cof, crd, uint_crd);

    Find_atom_neighbors<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
      atom_numbers, uint_crd, uint_dr_to_dr_cof, atom_in_grid_serial, gpointer, bucket, atom_numbers_in_grid_bucket,
      d_nl, cutoff_skin_square);

    Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0,
                                                    stream>>>(atom_numbers, d_nl, excluded_list_start, excluded_list,
                                                              excluded_numbers);
    refresh_sign[0] = 0;
  }
 }

 void Refresh_Neighbor_List_First_Time(int *refresh_sign, const int thread, const int atom_numbers, VECTOR *crd,
                                      VECTOR *old_crd, UNSIGNED_INT_VECTOR *uint_crd, float *crd_to_uint_crd_cof,
                                      float *uint_dr_to_dr_cof, int *atom_in_grid_serial, const float skin,
                                      float *box_length, const GRID_POINTER *gpointer, GRID_BUCKET *bucket,
                                      int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *d_nl, int *excluded_list_start,
                                      int *excluded_list, int *excluded_numbers, float cutoff_skin_square,
                                      int grid_numbers, float *grid_length_inverse, int *grid_N, int Nxy,
                                      cudaStream_t stream) {
  VECTOR trans_vec = {skin, skin, skin};
  Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / 32), 32, 0, stream>>>(
    grid_numbers, atom_numbers_in_grid_bucket, bucket);
  Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, box_length);
  Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial);
  Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, trans_vec);
  Copy_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 32), 32, 0, stream>>>(
    3 * atom_numbers, reinterpret_cast<float *>(crd), reinterpret_cast<float *>(old_crd));
  Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, atom_in_grid_serial, bucket, atom_numbers_in_grid_bucket);
  Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd_to_uint_crd_cof,
                                                                                   crd, uint_crd);

  Find_atom_neighbors<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(
    atom_numbers, uint_crd, uint_dr_to_dr_cof, atom_in_grid_serial, gpointer, bucket, atom_numbers_in_grid_bucket, d_nl,
    cutoff_skin_square);
  Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0,
                                                  stream>>>(atom_numbers, d_nl, excluded_list_start, excluded_list,
                                                            excluded_numbers);
 }

 __global__ void construct_neighbor_list_kernel(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers,
                                               int *nl_atom_serial, NEIGHBOR_LIST *nl) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) {
    nl[i].atom_numbers = nl_atom_numbers[i];
    nl[i].atom_serial = nl_atom_serial + i * max_neighbor_numbers;
  }
 }

 void Construct_Neighbor_List(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial,
                             NEIGHBOR_LIST *nl, cudaStream_t stream) {
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl);
 }

 void Refresh_Neighbor_List_No_Check(int grid_numbers, int atom_numbers, float skin, int Nxy, float cutoff_skin_square,
                                    int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket,
                                    float *grid_length_inverse, int *atom_in_grid_serial, GRID_BUCKET *bucket,
                                    VECTOR *crd, VECTOR *old_crd, float *crd_to_uint_crd_cof,
                                    UNSIGNED_INT_VECTOR *uint_crd, float *uint_dr_to_dr_cof, GRID_POINTER *gpointer,
                                    NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list,
                                    int *excluded_numbers, cudaStream_t stream) {
  VECTOR trans_vec = {-skin, -skin, -skin};

  Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / 32), 32, 0, stream>>>(
    grid_numbers, atom_numbers_in_grid_bucket, bucket);

  Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, trans_vec);

  Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, box_length);

  Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial);
  trans_vec.x = -trans_vec.x;
  trans_vec.y = -trans_vec.y;
  trans_vec.z = -trans_vec.z;
  Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, trans_vec);

  cudaMemcpyAsync(old_crd, crd, sizeof(VECTOR) * atom_numbers, cudaMemcpyDeviceToDevice, stream);

  Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, atom_in_grid_serial, bucket, atom_numbers_in_grid_bucket);

  Crd_To_Uint_Crd<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd_to_uint_crd_cof,
                                                                                   crd, uint_crd);

  Find_atom_neighbors<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_dr_to_dr_cof, atom_in_grid_serial, gpointer, bucket, atom_numbers_in_grid_bucket, d_nl,
    cutoff_skin_square);

  Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, d_nl, excluded_list_start, excluded_list, excluded_numbers);
 }

 __global__ void Mul_half(float *src, float *dst) {
  int index = threadIdx.x;
  if (index < 3) {
    dst[index] = src[index] * 0.5;
  }
 }

 void Neighbor_List_Update(int grid_numbers, int atom_numbers, int refresh_count, int refresh_interval,
                          int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
                          int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
                          int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,
                          float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd,
                          float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl,
                          int *excluded_list_start, int *excluded_list, int *excluded_numbers, float half_skin_square,
                          int *is_need_refresh_neighbor_list, cudaStream_t stream) {
  if (not_first_time) {
    if (refresh_interval > 0) {
      if (refresh_count % refresh_interval == 0) {
        Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
        Refresh_Neighbor_List_No_Check(
          grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length, atom_numbers_in_grid_bucket,
          grid_length_inverse, atom_in_grid_serial, bucket, reinterpret_cast<VECTOR *>(crd),
          reinterpret_cast<VECTOR *>(old_crd), crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
          uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list, excluded_numbers, stream);
      }
      refresh_count += 1;
    } else {
      Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
        atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd), half_skin_square,
        is_need_refresh_neighbor_list);
      Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
      Refresh_Neighbor_List(is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd),
                            reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
                            half_crd_to_uint_crd_cof, uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length,
                            gpointer, bucket, atom_numbers_in_grid_bucket, d_nl, excluded_list_start, excluded_list,
                            excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse, grid_N, Nxy,
                            stream);
    }
  } else {
    Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
    Refresh_Neighbor_List_First_Time(
      is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd),
      reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd), half_crd_to_uint_crd_cof,
      uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length, gpointer, bucket, atom_numbers_in_grid_bucket, d_nl,
      excluded_list_start, excluded_list, excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse,
      grid_N, Nxy, stream);
  }
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh
@@ -0,0 +1,58 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_

 struct VECTOR {
  float x;
  float y;
  float z;
 };
 struct INT_VECTOR {
  int int_x;
  int int_y;
  int int_z;
 };
 struct UNSIGNED_INT_VECTOR {
  unsigned int uint_x;
  unsigned int uint_y;
  unsigned int uint_z;
 };
 struct NEIGHBOR_LIST {
  int atom_numbers;
  int *atom_serial;
 };
 struct GRID_BUCKET {
  int *atom_serial;
 };
 struct GRID_POINTER {
  int *grid_serial;
 };

 void Construct_Neighbor_List(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial,
                             NEIGHBOR_LIST *nl, cudaStream_t stream);

 void Neighbor_List_Update(int grid_numbers, int atom_numbers, int refresh_count, int refresh_interval,
                          int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square,
                          int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse,
                          int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd,
                          float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd,
                          float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl,
                          int *excluded_list_start, int *excluded_list, int *excluded_numbers, float half_skin_square,
                          int *is_need_refresh_neighbor_list, cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cu
@@ -0,0 +1,139 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void MD_Iteration_Leap_Frog_With_LiuJian(const int atom_numbers, const float half_dt, const float dt,
                                                    const float exp_gamma, const float *inverse_mass,
                                                    const float *sqrt_mass_inverse, VECTOR *vel, VECTOR *crd,
                                                    VECTOR *frc, VECTOR *acc, VECTOR *random_frc) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < atom_numbers) {
    acc[i].x = inverse_mass[i] * frc[i].x;
    acc[i].y = inverse_mass[i] * frc[i].y;
    acc[i].z = inverse_mass[i] * frc[i].z;

    vel[i].x = vel[i].x + dt * acc[i].x;
    vel[i].y = vel[i].y + dt * acc[i].y;
    vel[i].z = vel[i].z + dt * acc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
    vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
    vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    frc[i].x = 0.;
    frc[i].y = 0.;
    frc[i].z = 0.;
  }
 }

 __global__ void MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity(const int atom_numbers, const float half_dt,
                                                                      const float dt, const float exp_gamma,
                                                                      const float *inverse_mass,
                                                                      const float *sqrt_mass_inverse, VECTOR *vel,
                                                                      VECTOR *crd, VECTOR *frc, VECTOR *acc,
                                                                      VECTOR *random_frc, const float max_vel) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  float abs_vel;
  if (i < atom_numbers) {
    acc[i].x = inverse_mass[i] * frc[i].x;
    acc[i].y = inverse_mass[i] * frc[i].y;
    acc[i].z = inverse_mass[i] * frc[i].z;

    vel[i].x = vel[i].x + dt * acc[i].x;
    vel[i].y = vel[i].y + dt * acc[i].y;
    vel[i].z = vel[i].z + dt * acc[i].z;

    abs_vel = norm3df(vel[i].x, vel[i].y, vel[i].z);
    if (abs_vel < max_vel) {
    } else {
      abs_vel = max_vel / abs_vel;
      vel[i].x = abs_vel * vel[i].x;
      vel[i].y = abs_vel * vel[i].y;
      vel[i].z = abs_vel * vel[i].z;
    }

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    vel[i].x = exp_gamma * vel[i].x + sqrt_mass_inverse[i] * random_frc[i].x;
    vel[i].y = exp_gamma * vel[i].y + sqrt_mass_inverse[i] * random_frc[i].y;
    vel[i].z = exp_gamma * vel[i].z + sqrt_mass_inverse[i] * random_frc[i].z;

    crd[i].x = crd[i].x + half_dt * vel[i].x;
    crd[i].y = crd[i].y + half_dt * vel[i].y;
    crd[i].z = crd[i].z + half_dt * vel[i].z;

    frc[i].x = 0.;
    frc[i].y = 0.;
    frc[i].z = 0.;
  }
 }

 void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, vel_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, crd_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, frc_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128>>>(3 * atom_numbers, acc_f, 0.);

  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));
  VECTOR *vel = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(vel_f));
  VECTOR *acc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(acc_f));
  VECTOR *crd = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(crd_f));

  curandStatePhilox4_32_10_t *rand_state;
  VECTOR *random_force;

  Cuda_Malloc_Safely(reinterpret_cast<void **>(&random_force), sizeof(float4) * float4_numbers);
  Cuda_Malloc_Safely(reinterpret_cast<void **>(&rand_state), sizeof(curandStatePhilox4_32_10_t) * float4_numbers);
  Setup_Rand_Normal_Kernel<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32>>>(float4_numbers, rand_state, 1);
  Rand_Normal<<<ceilf(static_cast<float>(float4_numbers) / 32.), 32, 0, stream>>>(
    float4_numbers, rand_state, reinterpret_cast<float4 *>(random_force));

  if (!is_max_velocity) {
    MD_Iteration_Leap_Frog_With_LiuJian<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
      atom_numbers, half_dt, dt, exp_gamma, d_mass_inverse, d_sqrt_mass, vel, crd, frc, acc, random_force);
  } else {
    MD_Iteration_Leap_Frog_With_LiuJian_With_Max_Velocity<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0,
                                                            stream>>>(atom_numbers, half_dt, dt, exp_gamma,
                                                                      d_mass_inverse, d_sqrt_mass, vel, crd, frc, acc,
                                                                      random_force, max_velocity);

    cudaStreamSynchronize(stream);
    cudaFree(random_force);
    cudaFree(rand_state);

    return;
  }
 }

 void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void MDIterationLeapFrog(const int float4_numbers, const int atom_numbers, const float half_dt, const float dt,
                         const float exp_gamma, const int is_max_velocity, const float max_velocity,
                         const float *d_mass_inverse, const float *d_sqrt_mass, float *vel_f, float *crd_f,
                         float *frc_f, float *acc_f, cudaStream_t stream);

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NVTIT_MD_ITERATION_LEAP_FROG_IMPL_H
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh
@@ -0,0 +1,230 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_COMMON_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_COMMON_H_
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"
 __constant__ float PME_Ma[4] = {1.0 / 6.0, -0.5, 0.5, -1.0 / 6.0};
 __constant__ float PME_Mb[4] = {0, 0.5, -1, 0.5};
 __constant__ float PME_Mc[4] = {0, 0.5, 0, -0.5};
 __constant__ float PME_Md[4] = {0, 1.0 / 6.0, 4.0 / 6.0, 1.0 / 6.0};
 __constant__ float PME_dMa[4] = {0.5, -1.5, 1.5, -0.5};
 __constant__ float PME_dMb[4] = {0, 1, -2, 1};
 __constant__ float PME_dMc[4] = {0, 0.5, 0, -0.5};
 #define PI 3.1415926
 const float periodic_factor_inverse = 2.32830643e-10;
 static dim3 thread_PME;

 const float cutoff = 10.0;
 const float tolerance = 0.00001;

 static float M_(float u, int n) {
  if (n == 2) {
    if (u > 2 || u < 0) return 0;
    return 1 - abs(u - 1);
  } else {
    return u / (n - 1) * M_(u, n - 1) + (n - u) / (n - 1) * M_(u - 1, n - 1);
  }
 }

 static float Get_Beta(float cutoff, float tolerance) {
  float beta, low, high, tempf;
  int ilow, ihigh;

  high = 1.0;
  ihigh = 1;

  while (1) {
    tempf = erfc(high * cutoff) / cutoff;
    if (tempf <= tolerance) break;
    high *= 2;
    ihigh++;
  }

  ihigh += 50;
  low = 0.0;
  for (ilow = 1; ilow < ihigh; ilow++) {
    beta = (low + high) / 2;
    tempf = erfc(beta * cutoff) / cutoff;
    if (tempf >= tolerance)
      low = beta;
    else
      high = beta;
  }
  return beta;
 }

 static cufftComplex expc(cufftComplex z) {
  cufftComplex res;
  float t = expf(z.x);
  sincosf(z.y, &res.y, &res.x);
  res.x *= t;
  res.y *= t;
  return res;
 }

 static float getb(int k, int NFFT, int B_order) {
  cufftComplex tempc, tempc2, res;
  float tempf;
  tempc2.x = 0;
  tempc2.y = 0;

  tempc.x = 0;
  tempc.y = 2 * (B_order - 1) * PI * k / NFFT;
  res = expc(tempc);

  for (int kk = 0; kk < (B_order - 1); kk++) {
    tempc.x = 0;
    tempc.y = 2 * PI * k / NFFT * kk;
    tempc = expc(tempc);
    tempf = M_(kk + 1, B_order);
    tempc2.x += tempf * tempc.x;
    tempc2.y += tempf * tempc.y;
  }
  res = cuCdivf(res, tempc2);
  return res.x * res.x + res.y * res.y;
 }

 __global__ static void PME_Atom_Near(const UNSIGNED_INT_VECTOR *uint_crd, int *PME_atom_near, const int PME_Nin,
                                     const float periodic_factor_inverse_x, const float periodic_factor_inverse_y,
                                     const float periodic_factor_inverse_z, const int atom_numbers, const int fftx,
                                     const int ffty, const int fftz, const UNSIGNED_INT_VECTOR *PME_kxyz,
                                     UNSIGNED_INT_VECTOR *PME_uxyz, VECTOR *PME_frxyz) {
  int atom = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom < atom_numbers) {
    UNSIGNED_INT_VECTOR *temp_uxyz = &PME_uxyz[atom];
    int k, tempux, tempuy, tempuz;
    float tempf;
    tempf = static_cast<float> (uint_crd[atom].uint_x) * periodic_factor_inverse_x;
    tempux = static_cast<int> (tempf);
    PME_frxyz[atom].x = tempf - tempux;

    tempf = static_cast<float> (uint_crd[atom].uint_y) * periodic_factor_inverse_y;
    tempuy = static_cast<int> (tempf);
    PME_frxyz[atom].y = tempf - tempuy;

    tempf = static_cast<float> (uint_crd[atom].uint_z) * periodic_factor_inverse_z;
    tempuz = static_cast<int> (tempf);
    PME_frxyz[atom].z = tempf - tempuz;

    if (tempux != (*temp_uxyz).uint_x || tempuy != (*temp_uxyz).uint_y || tempuz != (*temp_uxyz).uint_z) {
      (*temp_uxyz).uint_x = tempux;
      (*temp_uxyz).uint_y = tempuy;
      (*temp_uxyz).uint_z = tempuz;
      int *temp_near = PME_atom_near + atom * 64;
      int kx, ky, kz;
      for (k = 0; k < 64; k++) {
        UNSIGNED_INT_VECTOR temp_kxyz = PME_kxyz[k];
        kx = tempux - temp_kxyz.uint_x;
        if (kx < 0) kx += fftx;
        ky = tempuy - temp_kxyz.uint_y;
        if (ky < 0) ky += ffty;
        kz = tempuz - temp_kxyz.uint_z;
        if (kz < 0) kz += fftz;
        temp_near[k] = kx * PME_Nin + ky * fftz + kz;
      }
    }
  }
 }

 __global__ static void PME_Q_Spread(int *PME_atom_near, const float *charge, const VECTOR *PME_frxyz, float *PME_Q,
                                    const UNSIGNED_INT_VECTOR *PME_kxyz, const int atom_numbers) {
  int atom = blockDim.x * blockIdx.x + threadIdx.x;

  if (atom < atom_numbers) {
    int k;
    float tempf, tempQ, tempf2;

    int *temp_near = PME_atom_near + atom * 64;
    VECTOR temp_frxyz = PME_frxyz[atom];
    float tempcharge = charge[atom];

    UNSIGNED_INT_VECTOR temp_kxyz;
    unsigned int kx;

    for (k = threadIdx.y; k < 64; k = k + blockDim.y) {
      temp_kxyz = PME_kxyz[k];
      kx = temp_kxyz.uint_x;
      tempf = (temp_frxyz.x);
      tempf2 = tempf * tempf;
      tempf = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];

      tempQ = tempcharge * tempf;

      kx = temp_kxyz.uint_y;
      tempf = (temp_frxyz.y);
      tempf2 = tempf * tempf;
      tempf = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];

      tempQ = tempQ * tempf;

      kx = temp_kxyz.uint_z;
      tempf = (temp_frxyz.z);
      tempf2 = tempf * tempf;
      tempf = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];
      tempQ = tempQ * tempf;

      atomicAdd(&PME_Q[temp_near[k]], tempQ);
    }
  }
 }

 __global__ static void PME_Direct_Energy(const int atom_numbers, const NEIGHBOR_LIST *nl,
                                         const UNSIGNED_INT_VECTOR *uint_crd, const VECTOR *boxlength,
                                         const float *charge, const float beta, const float cutoff_square,
                                         float *direct_ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr2;
    float dr_abs;
    // float dr_inverse;
    float ene_temp;
    float charge_i = charge[atom_i];
    float ene_lin = 0.;

    // int x, y;
    // int atom_pair_LJ_type;
    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];

      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;

      dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
      if (dr2 < cutoff_square) {
        dr_abs = norm3df(dr.x, dr.y, dr.z);
        ene_temp = charge_i * charge[atom_j] * erfcf(beta * dr_abs) / dr_abs;
        ene_lin = ene_lin + ene_temp;
      }
    }
    atomicAdd(direct_ene, ene_lin);
  }
 }

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cu
@@ -0,0 +1,234 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh"

 __global__ void PME_Energy_Product(const int element_number, const float *list1, const float *list2, float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.0;
  for (int i = threadIdx.x; i < element_number; i = i + blockDim.x) {
    lin = lin + list1[i] * list2[i];
  }
  atomicAdd(sum, lin);
 }

 __global__ void PME_Energy_Reciprocal(const int element_number, const cufftComplex *FQ, const float *BC, float *sum) {
  if (threadIdx.x == 0) {
    sum[0] = 0.;
  }
  __syncthreads();
  float lin = 0.0;
  cufftComplex FQ_i;
  for (int i = threadIdx.x; i < element_number; i = i + blockDim.x) {
    FQ_i = FQ[i];
    lin = lin + (FQ_i.x * FQ_i.x + FQ_i.y * FQ_i.y) * BC[i];
  }
  atomicAdd(sum, lin);
 }

 __global__ void PME_Excluded_Energy_Correction(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                               const VECTOR *sacler, const float *charge, const float pme_beta,
                                               const float sqrt_pi, const int *excluded_list_start,
                                               const int *excluded_list, const int *excluded_atom_numbers, float *ene) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int excluded_number = excluded_atom_numbers[atom_i];
    if (excluded_number > 0) {
      int list_start = excluded_list_start[atom_i];
      // int atom_min = excluded_list[list_start];
      int list_end = list_start + excluded_number;
      int atom_j;
      int int_x;
      int int_y;
      int int_z;

      float charge_i = charge[atom_i];
      float charge_j;
      float dr_abs;
      float beta_dr;

      UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
      VECTOR dr;
      float dr2;

      float ene_lin = 0.;

      for (int i = list_start; i < list_end; i = i + 1) {
        atom_j = excluded_list[i];
        r2 = uint_crd[atom_j];
        charge_j = charge[atom_j];

        int_x = r2.uint_x - r1.uint_x;
        int_y = r2.uint_y - r1.uint_y;
        int_z = r2.uint_z - r1.uint_z;
        dr.x = sacler[0].x * int_x;
        dr.y = sacler[0].y * int_y;
        dr.z = sacler[0].z * int_z;
        dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

        dr_abs = sqrtf(dr2);
        beta_dr = pme_beta * dr_abs;

        ene_lin -= charge_i * charge_j * erff(beta_dr) / dr_abs;
      }
      atomicAdd(ene, ene_lin);
    }
  }
 }

 void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *box_length_f, float *PME_BC,
               int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
               const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
               const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
               const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
               float *d_correction_ene, cudaStream_t stream) {
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);
  std::vector<float> h_box_length(3);
  cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);
  VECTOR *box_length = reinterpret_cast<VECTOR *>(h_box_length.data());

  UNSIGNED_INT_VECTOR *PME_uxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_uxyz);
  UNSIGNED_INT_VECTOR *PME_kxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_kxyz);
  VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
  cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);
  cufftHandle PME_plan_r2c;
  cufftHandle PME_plan_c2r;
  cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
  cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
  cufftSetStream(PME_plan_r2c, stream);
  cufftSetStream(PME_plan_c2r, stream);
  thread_PME.x = 8;
  thread_PME.y = 8;
  int PME_Nin = ffty * fftz;
  int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
  int PME_Nall = fftx * ffty * fftz;
  float volume = box_length[0].x * box_length[0].y * box_length[0].z;

  UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
  Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);

  int kx, ky, kz, kxrp, kyrp, kzrp, index;
  for (kx = 0; kx < 4; kx++) {
    for (ky = 0; ky < 4; ky++) {
      for (kz = 0; kz < 4; kz++) {
        index = kx * 16 + ky * 4 + kz;
        PME_kxyz_cpu[index].uint_x = kx;
        PME_kxyz_cpu[index].uint_y = ky;
        PME_kxyz_cpu[index].uint_z = kz;
      }
    }
  }
  cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  free(PME_kxyz_cpu);

  // initial start
  float *B1, *B2, *B3, *PME_BC0;
  B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
  B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
  B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
  PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));

  for (kx = 0; kx < fftx; kx++) {
    B1[kx] = getb(kx, fftx, 4);
  }

  for (ky = 0; ky < ffty; ky++) {
    B2[ky] = getb(ky, ffty, 4);
  }

  for (kz = 0; kz < fftz; kz++) {
    B3[kz] = getb(kz, fftz, 4);
  }
  float mprefactor = PI * PI / -beta / beta;

  float msq;
  for (kx = 0; kx < fftx; kx++) {
    kxrp = kx;
    if (kx > fftx / 2) kxrp = fftx - kx;
    for (ky = 0; ky < ffty; ky++) {
      kyrp = ky;
      if (ky > ffty / 2) kyrp = ffty - ky;
      for (kz = 0; kz <= fftz / 2; kz++) {
        kzrp = kz;

        msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
              kzrp * kzrp / box_length[0].z / box_length[0].z;
        index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
        if ((kx + ky + kz) == 0) {
          PME_BC0[index] = 0;
        } else {
          PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
        }

        PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
      }
    }
  }

  cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  free(B1);
  free(B2);
  free(B3);
  free(PME_BC0);

  Reset_List<<<3 * atom_numbers / 32 + 1, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<int *>(PME_uxyz),
                                                           1 << 30);
  PME_Atom_Near<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    uint_crd, PME_atom_near, PME_Nin, periodic_factor_inverse * fftx, periodic_factor_inverse * ffty,
    periodic_factor_inverse * fftz, atom_numbers, fftx, ffty, fftz, PME_kxyz, PME_uxyz, PME_frxyz);

  Reset_List<<<PME_Nall / 1024 + 1, 1024, 0, stream>>>(PME_Nall, PME_Q, 0);

  PME_Q_Spread<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(PME_atom_near, charge, PME_frxyz, PME_Q,
                                                                           PME_kxyz, atom_numbers);

  cufftExecR2C(PME_plan_r2c, reinterpret_cast<float *>(PME_Q), reinterpret_cast<cufftComplex *>(PME_FQ));

  PME_Energy_Reciprocal<<<1, 1024, 0, stream>>>(PME_Nfft, PME_FQ, PME_BC, d_reciprocal_ene);

  PME_Energy_Product<<<1, 1024, 0, stream>>>(atom_numbers, charge, charge, d_self_ene);
  Scale_List<<<1, 1, 0, stream>>>(1, d_self_ene, -beta / sqrtf(PI));

  Reset_List<<<1, 1, 0, stream>>>(1, d_direct_ene, 0.0);
  PME_Direct_Energy<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(
    atom_numbers, nl_a, uint_crd, scaler, charge, beta, cutoff * cutoff, d_direct_ene);

  Reset_List<<<1, 1, 0, stream>>>(1, d_correction_ene, 0.0);
  PME_Excluded_Energy_Correction<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    atom_numbers, uint_crd, scaler, charge, beta, sqrtf(PI), excluded_list_start, excluded_list, excluded_atom_numbers,
    d_correction_ene);
  return;
 }
 void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *box_length_f, float *PME_BC,
               int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
               const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
               const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
               const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
               float *d_correction_ene, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cuh
@@ -0,0 +1,30 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_ENERGY_IMPL_H_

 #include <curand_kernel.h>
 #include <vector>
 #include "runtime/device/gpu/cuda_common.h"

 void PMEEnergy(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *box_length_f, float *PME_BC,
               int *pme_uxyz, float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
               const int *uint_crd_f, const float *charge, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
               const float *scaler_f, const int *excluded_list_start, const int *excluded_list,
               const int *excluded_atom_numbers, float *d_reciprocal_ene, float *d_self_ene, float *d_direct_ene,
               float *d_correction_ene, cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cu
@@ -0,0 +1,102 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"

 __global__ void PME_Excluded_Force_Correction(const int atom_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                              const VECTOR *sacler, const float *charge, const float pme_beta,
                                              const float sqrt_pi, const int *excluded_list_start,
                                              const int *excluded_list, const int *excluded_atom_numbers, VECTOR *frc) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    int excluded_numbers = excluded_atom_numbers[atom_i];
    if (excluded_numbers > 0) {
      int list_start = excluded_list_start[atom_i];
      // int atom_min = excluded_list[list_start];
      int list_end = list_start + excluded_numbers;
      int atom_j;
      int int_x;
      int int_y;
      int int_z;

      float charge_i = charge[atom_i];
      float charge_j;
      float dr_abs;
      float beta_dr;

      UNSIGNED_INT_VECTOR r1 = uint_crd[atom_i], r2;
      VECTOR dr;
      float dr2;

      float frc_abs = 0.;
      VECTOR frc_lin;
      VECTOR frc_record = {0., 0., 0.};

      for (int i = list_start; i < list_end; i = i + 1) {
        atom_j = excluded_list[i];
        r2 = uint_crd[atom_j];
        charge_j = charge[atom_j];

        int_x = r2.uint_x - r1.uint_x;
        int_y = r2.uint_y - r1.uint_y;
        int_z = r2.uint_z - r1.uint_z;
        dr.x = sacler[0].x * int_x;
        dr.y = sacler[0].y * int_y;
        dr.z = sacler[0].z * int_z;
        dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;

        dr_abs = sqrtf(dr2);
        beta_dr = pme_beta * dr_abs;
        // sqrt_pi= 2/sqrt(3.141592654);
        frc_abs = beta_dr * sqrt_pi * expf(-beta_dr * beta_dr) + erfcf(beta_dr);
        frc_abs = (frc_abs - 1.) / dr2 / dr_abs;
        frc_abs = -charge_i * charge_j * frc_abs;
        frc_lin.x = frc_abs * dr.x;
        frc_lin.y = frc_abs * dr.y;
        frc_lin.z = frc_abs * dr.z;

        frc_record.x = frc_record.x + frc_lin.x;
        frc_record.y = frc_record.y + frc_lin.y;
        frc_record.z = frc_record.z + frc_lin.z;

        atomicAdd(&frc[atom_j].x, -frc_lin.x);
        atomicAdd(&frc[atom_j].y, -frc_lin.y);
        atomicAdd(&frc[atom_j].z, -frc_lin.z);
      }  // atom_j cycle
      atomicAdd(&frc[atom_i].x, frc_record.x);
      atomicAdd(&frc[atom_i].y, frc_record.y);
      atomicAdd(&frc[atom_i].z, frc_record.z);
    }  // if need excluded
  }
 }

 void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f,
                      const float *charge, const int *excluded_list_start, const int *excluded_list,
                      const int *excluded_atom_numbers, float *frc_f, cudaStream_t stream) {
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  VECTOR *frc = reinterpret_cast<VECTOR *>(frc_f);
  VECTOR *sacler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(sacler_f));

  PME_Excluded_Force_Correction<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, uint_crd, sacler, charge, pme_beta, TWO_DIVIDED_BY_SQRT_PI, excluded_list_start, excluded_list,
    excluded_atom_numbers, frc);
  return;
 }

 void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f,
                      const float *charge, const int *excluded_list_start, const int *excluded_list,
                      const int *excluded_atom_numbers, float *frc_f, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh
@@ -0,0 +1,26 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_EXCLUDED_FORCE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_EXCLUDED_FORCE_IMPL_H_

 #include <curand_kernel.h>
 #include "runtime/device/gpu/cuda_common.h"

 void PMEExcludedForce(const int atom_numbers, const float pme_beta, const int *uint_crd_f, const float *sacler_f,
                      const float *charge, const int *excluded_list_start, const int *excluded_list,
                      const int *excluded_atom_numbers, float *frc_f, cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cu
@@ -0,0 +1,204 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_common.cuh"

 __global__ void PME_BCFQ(cufftComplex *PME_FQ, float *PME_BC, int PME_Nfft) {
  int index = blockDim.x * blockIdx.x + threadIdx.x;
  if (index < PME_Nfft) {
    float tempf = PME_BC[index];
    cufftComplex tempc = PME_FQ[index];
    PME_FQ[index].x = tempc.x * tempf;
    PME_FQ[index].y = tempc.y * tempf;
  }
 }

 __global__ void PME_Final(int *PME_atom_near, const float *charge, const float *PME_Q, VECTOR *force,
                          const VECTOR *PME_frxyz, const UNSIGNED_INT_VECTOR *PME_kxyz,
                          const VECTOR PME_inverse_box_vector, const int atom_numbers) {
  int atom = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom < atom_numbers) {
    int k, kx;
    float tempdQx, tempdQy, tempdQz, tempdx, tempdy, tempdz, tempx, tempy, tempz, tempdQf;
    float tempf, tempf2;
    float temp_charge = charge[atom];
    int *temp_near = PME_atom_near + atom * 64;
    UNSIGNED_INT_VECTOR temp_kxyz;
    VECTOR temp_frxyz = PME_frxyz[atom];
    for (k = threadIdx.y; k < 64; k = k + blockDim.y) {
      temp_kxyz = PME_kxyz[k];
      tempdQf = -PME_Q[temp_near[k]] * temp_charge;

      kx = temp_kxyz.uint_x;
      tempf = (temp_frxyz.x);
      tempf2 = tempf * tempf;
      tempx = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];
      tempdx = PME_dMa[kx] * tempf2 + PME_dMb[kx] * tempf + PME_dMc[kx];

      kx = temp_kxyz.uint_y;
      tempf = (temp_frxyz.y);
      tempf2 = tempf * tempf;
      tempy = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];
      tempdy = PME_dMa[kx] * tempf2 + PME_dMb[kx] * tempf + PME_dMc[kx];

      kx = temp_kxyz.uint_z;
      tempf = (temp_frxyz.z);
      tempf2 = tempf * tempf;
      tempz = PME_Ma[kx] * tempf * tempf2 + PME_Mb[kx] * tempf2 + PME_Mc[kx] * tempf + PME_Md[kx];
      tempdz = PME_dMa[kx] * tempf2 + PME_dMb[kx] * tempf + PME_dMc[kx];

      tempdQx = tempdx * tempy * tempz * PME_inverse_box_vector.x;
      tempdQy = tempdy * tempx * tempz * PME_inverse_box_vector.y;
      tempdQz = tempdz * tempx * tempy * PME_inverse_box_vector.z;

      atomicAdd(&force[atom].x, tempdQf * tempdQx);
      atomicAdd(&force[atom].y, tempdQf * tempdQy);
      atomicAdd(&force[atom].z, tempdQf * tempdQz);
    }
  }
 }

 void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                        float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
                        const float *box_length_f, const int *uint_crd_f, const float *charge, float *force,
                        cudaStream_t stream) {
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  UNSIGNED_INT_VECTOR *PME_uxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_uxyz);
  UNSIGNED_INT_VECTOR *PME_kxyz = reinterpret_cast<UNSIGNED_INT_VECTOR *>(pme_kxyz);
  Reset_List<<<3 * atom_numbers / 32 + 1, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<int *>(PME_uxyz),
                                                           1 << 30);

  VECTOR *PME_frxyz = reinterpret_cast<VECTOR *>(pme_frxyz);
  VECTOR *frc = reinterpret_cast<VECTOR *>(force);

  std::vector<float> h_box_length(3);
  cudaMemcpyAsync(h_box_length.data(), box_length_f, sizeof(float) * h_box_length.size(), cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);
  VECTOR *box_length = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(h_box_length.data()));
  cufftComplex *PME_FQ = reinterpret_cast<cufftComplex *>(pme_fq);

  VECTOR PME_inverse_box_vector;
  PME_inverse_box_vector.x = static_cast<float>(fftx) / box_length[0].x;
  PME_inverse_box_vector.y = static_cast<float>(ffty) / box_length[0].y;
  PME_inverse_box_vector.z = static_cast<float>(fftz) / box_length[0].z;
  cufftHandle PME_plan_r2c;
  cufftHandle PME_plan_c2r;
  cufftPlan3d(&PME_plan_r2c, fftx, ffty, fftz, CUFFT_R2C);
  cufftPlan3d(&PME_plan_c2r, fftx, ffty, fftz, CUFFT_C2R);
  cufftSetStream(PME_plan_r2c, stream);
  cufftSetStream(PME_plan_c2r, stream);
  thread_PME.x = 8;
  thread_PME.y = 8;
  int PME_Nin = ffty * fftz;
  int PME_Nfft = fftx * ffty * (fftz / 2 + 1);
  int PME_Nall = fftx * ffty * fftz;
  float volume = box_length[0].x * box_length[0].y * box_length[0].z;

  UNSIGNED_INT_VECTOR *PME_kxyz_cpu;
  Malloc_Safely(reinterpret_cast<void **>(&PME_kxyz_cpu), sizeof(UNSIGNED_INT_VECTOR) * 64);

  int kx, ky, kz, kxrp, kyrp, kzrp, index;
  for (kx = 0; kx < 4; kx++) {
    for (ky = 0; ky < 4; ky++) {
      for (kz = 0; kz < 4; kz++) {
        index = kx * 16 + ky * 4 + kz;
        PME_kxyz_cpu[index].uint_x = kx;
        PME_kxyz_cpu[index].uint_y = ky;
        PME_kxyz_cpu[index].uint_z = kz;
      }
    }
  }
  cudaMemcpyAsync(PME_kxyz, PME_kxyz_cpu, sizeof(UNSIGNED_INT_VECTOR) * 64, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  free(PME_kxyz_cpu);

  // initial start
  float *B1, *B2, *B3, *PME_BC0;
  B1 = reinterpret_cast<float *>(malloc(sizeof(float) * fftx));
  B2 = reinterpret_cast<float *>(malloc(sizeof(float) * ffty));
  B3 = reinterpret_cast<float *>(malloc(sizeof(float) * fftz));
  PME_BC0 = reinterpret_cast<float *>(malloc(sizeof(float) * PME_Nfft));

  for (kx = 0; kx < fftx; kx++) {
    B1[kx] = getb(kx, fftx, 4);
  }

  for (ky = 0; ky < ffty; ky++) {
    B2[ky] = getb(ky, ffty, 4);
  }

  for (kz = 0; kz < fftz; kz++) {
    B3[kz] = getb(kz, fftz, 4);
  }
  float mprefactor = PI * PI / -beta / beta;
  float msq;
  for (kx = 0; kx < fftx; kx++) {
    kxrp = kx;
    if (kx > fftx / 2) kxrp = fftx - kx;
    for (ky = 0; ky < ffty; ky++) {
      kyrp = ky;
      if (ky > ffty / 2) kyrp = ffty - ky;
      for (kz = 0; kz <= fftz / 2; kz++) {
        kzrp = kz;

        msq = kxrp * kxrp / box_length[0].x / box_length[0].x + kyrp * kyrp / box_length[0].y / box_length[0].y +
              kzrp * kzrp / box_length[0].z / box_length[0].z;
        index = kx * ffty * (fftz / 2 + 1) + ky * (fftz / 2 + 1) + kz;
        if ((kx + ky + kz) == 0) {
          PME_BC0[index] = 0;
        } else {
          PME_BC0[index] = 1.0 / PI / msq * exp(mprefactor * msq) / volume;
        }

        PME_BC0[index] *= B1[kx] * B2[ky] * B3[kz];
      }
    }
  }

  cudaMemcpyAsync(PME_BC, PME_BC0, sizeof(float) * PME_Nfft, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  free(B1);
  free(B2);
  free(B3);
  free(PME_BC0);

  // initial end
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(
    3 * atom_numbers, reinterpret_cast<float *>(frc), 0.);
  PME_Atom_Near<<<atom_numbers / 32 + 1, 32, 0, stream>>>(
    uint_crd, PME_atom_near, PME_Nin, periodic_factor_inverse * fftx, periodic_factor_inverse * ffty,
    periodic_factor_inverse * fftz, atom_numbers, fftx, ffty, fftz, PME_kxyz, PME_uxyz, PME_frxyz);
  Reset_List<<<PME_Nall / 1024 + 1, 1024, 0, stream>>>(PME_Nall, PME_Q, 0);

  PME_Q_Spread<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(PME_atom_near, charge, PME_frxyz, PME_Q,
                                                                           PME_kxyz, atom_numbers);

  cufftExecR2C(PME_plan_r2c, reinterpret_cast<float *>(PME_Q), reinterpret_cast<cufftComplex *>(PME_FQ));
  PME_BCFQ<<<PME_Nfft / 1024 + 1, 1024, 0, stream>>>(PME_FQ, PME_BC, PME_Nfft);

  cufftExecC2R(PME_plan_c2r, reinterpret_cast<cufftComplex *>(PME_FQ), reinterpret_cast<float *>(PME_Q));

  PME_Final<<<atom_numbers / thread_PME.x + 1, thread_PME, 0, stream>>>(PME_atom_near, charge, PME_Q, frc, PME_frxyz,
                                                                        PME_kxyz, PME_inverse_box_vector, atom_numbers);
  return;
 }

 void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                        float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
                        const float *box_length_f, const int *uint_crd_f, const float *charge, float *force,
                        cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh
@@ -0,0 +1,28 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_PME_PME_RECIPROCAL_FORCE_IMPL_H_

 #include <curand_kernel.h>
 #include <vector>
 #include "runtime/device/gpu/cuda_common.h"

 void PMEReciprocalForce(int fftx, int ffty, int fftz, int atom_numbers, float beta, float *PME_BC, int *pme_uxyz,
                        float *pme_frxyz, float *PME_Q, float *pme_fq, int *PME_atom_near, int *pme_kxyz,
                        const float *box_length_f, const int *uint_crd_f, const float *charge, float *force,
                        cudaStream_t stream);

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/getcenter_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/getcenter_kernel.cc
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/common/getcenter_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(
  GetCenterOfGeometry,
  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  GetCenterOfGeometryGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/getcenter_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/getcenter_kernel.h
@@ -0,0 +1,89 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GETCENTER_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GETCENTER_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/getcenter_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class GetCenterOfGeometryGpuKernel : public GpuKernel {
 public:
  GetCenterOfGeometryGpuKernel() : ele_center_atoms(1) {}
  ~GetCenterOfGeometryGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    center_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "center_numbers"));
    center_numbers_inverse = static_cast<int>(GetAttr<float>(kernel_node, "center_numbers_inverse"));

    auto shape_center_atoms = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

    for (size_t i = 0; i < shape_center_atoms.size(); i++) ele_center_atoms *= shape_center_atoms[i];
    for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto center_atoms = GetDeviceAddress<const T1>(inputs, 0);
    auto crd = GetDeviceAddress<const T>(inputs, 1);

    auto center_of_geometry = GetDeviceAddress<T>(outputs, 0);

    GetCenterOfGeometry(center_numbers, center_numbers_inverse, center_atoms, crd, center_of_geometry,
                        reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_center_atoms * sizeof(T1));
    input_size_list_.push_back(ele_crd * sizeof(T));

    output_size_list_.push_back(3 * sizeof(T));
  }

 private:
  size_t ele_center_atoms = 1;
  size_t ele_crd = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int center_numbers;
  float center_numbers_inverse;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GETCENTER_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/mdtemperature_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/mdtemperature_kernel.cc
@@ -0,0 +1,31 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/common/mdtemperature_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(MDTemperature,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      MDTemperatureGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/mdtemperature_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/common/mdtemperature_kernel.h
@@ -0,0 +1,96 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MDTEMPERATURE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MDTEMPERATURE_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/mdtemperature_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class MDTemperatureGpuKernel : public GpuKernel {
 public:
  MDTemperatureGpuKernel() : ele_start(1) {}
  ~MDTemperatureGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    residue_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "residue_numbers"));

    auto shape_start = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_end = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_atom_vel = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_atom_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);

    for (size_t i = 0; i < shape_start.size(); i++) ele_start *= shape_start[i];
    for (size_t i = 0; i < shape_end.size(); i++) ele_end *= shape_end[i];
    for (size_t i = 0; i < shape_atom_vel.size(); i++) ele_atom_vel *= shape_atom_vel[i];
    for (size_t i = 0; i < shape_atom_mass.size(); i++) ele_atom_mass *= shape_atom_mass[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto start = GetDeviceAddress<const T1>(inputs, 0);
    auto end = GetDeviceAddress<const T1>(inputs, 1);
    auto atom_vel_f = GetDeviceAddress<const T>(inputs, 2);
    auto atom_mass = GetDeviceAddress<const T>(inputs, 3);

    auto ek = GetDeviceAddress<T>(outputs, 0);

    MDTemperature(residue_numbers, start, end, atom_vel_f, atom_mass, ek, reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_start * sizeof(T1));
    input_size_list_.push_back(ele_end * sizeof(T1));
    input_size_list_.push_back(ele_atom_vel * sizeof(T));
    input_size_list_.push_back(ele_atom_mass * sizeof(T));

    output_size_list_.push_back(residue_numbers * sizeof(T));
  }

 private:
  size_t ele_start = 1;
  size_t ele_end = 1;
  size_t ele_atom_vel = 1;
  size_t ele_atom_mass = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int residue_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MDTEMPERATURE_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_energy_kernel.cc
@@ -0,0 +1,34 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/lj/lj_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(LJEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      LJEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_energy_kernel.h
@@ -0,0 +1,130 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_ENERGY_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class LJEnergyGpuKernel : public GpuKernel {
 public:
  LJEnergyGpuKernel() : ele_uint_crd(1) {}
  ~LJEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    cutoff_square = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff_square"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    // for (size_t i = 0; i < shape_nl.size(); i++) ele_nl *= shape_nl[i];
    for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
    for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto scaler = GetDeviceAddress<T>(inputs, 3);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
    auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
    auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);

    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto d_LJ_energy_atom = GetDeviceAddress<T>(outputs, 0);
    LJEnergy(atom_numbers, cutoff_square, uint_crd, LJtype, charge, scaler, uint_crd_with_LJ, nl_numbers, nl_serial, nl,
             d_LJ_a, d_LJ_b, d_LJ_energy_atom, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_nl_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));

    workspace_size_list_.push_back(atom_numbers * max_nl_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_nl = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  float cutoff_square;
  int max_nl_numbers = 800;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_kernel.cc
@@ -0,0 +1,34 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/lj/lj_force_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(LJForce,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      LJForceGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_kernel.h
@@ -0,0 +1,129 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class LJForceGpuKernel : public GpuKernel {
 public:
  LJForceGpuKernel() : ele_uint_crd(1) {}
  ~LJForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    cutoff_square = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff_square"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    // for (size_t i = 0; i < shape_nl.size(); i++) ele_nl *= shape_nl[i];
    for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
    for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto scaler = GetDeviceAddress<T>(inputs, 3);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
    auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
    auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);

    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto frc = GetDeviceAddress<T>(outputs, 0);
    LJForce(atom_numbers, cutoff_square, uint_crd, LJtype, charge, scaler, uint_crd_with_LJ, nl_numbers, nl_serial, nl,
            d_LJ_a, d_LJ_b, frc, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_nl_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));

    workspace_size_list_.push_back(atom_numbers * max_nl_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  float cutoff_square;
  int max_nl_numbers = 800;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_kernel.cc
@@ -0,0 +1,34 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(LJForceWithPMEDirectForce,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      LJForceWithPMEDirectForceGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/lj/lj_force_with_pme_direct_force_kernel.h
@@ -0,0 +1,133 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_LJ_LJ_FORCE_WITH_PME_DIRECT_FORCE_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_force_with_pme_direct_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class LJForceWithPMEDirectForceGpuKernel : public GpuKernel {
 public:
  LJForceWithPMEDirectForceGpuKernel() : ele_uint_crd(1) {}
  ~LJForceWithPMEDirectForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    cutoff = static_cast<float>(GetAttr<float_t>(kernel_node, "cutoff"));
    pme_beta = static_cast<float>(GetAttr<float_t>(kernel_node, "pme_beta"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_nl_numbers = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_nl_serial = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_d_LJ_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_d_LJ_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i];
    // for (size_t i = 0; i < shape_nl.size(); i++) ele_nl *= shape_nl[i];
    for (size_t i = 0; i < shape_d_LJ_a.size(); i++) ele_d_LJ_a *= shape_d_LJ_a[i];
    for (size_t i = 0; i < shape_d_LJ_b.size(); i++) ele_d_LJ_b *= shape_d_LJ_b[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto scaler = GetDeviceAddress<T>(inputs, 3);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 4);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 5);
    auto d_LJ_a = GetDeviceAddress<T>(inputs, 6);
    auto d_LJ_b = GetDeviceAddress<T>(inputs, 7);

    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);
    auto nl = GetDeviceAddress<T1>(workspace, 1);

    auto frc = GetDeviceAddress<T>(outputs, 0);
    LJForceWithPMEDirectForce(atom_numbers, cutoff, pme_beta, uint_crd, LJtype, charge, scaler, uint_crd_with_LJ,
                              nl_numbers, nl_serial, nl, d_LJ_a, d_LJ_b, frc,
                              reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_scaler * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_nl_numbers * sizeof(T1));
    input_size_list_.push_back(ele_d_LJ_a * sizeof(T));
    input_size_list_.push_back(ele_d_LJ_b * sizeof(T));

    workspace_size_list_.push_back(atom_numbers * max_nl_numbers * sizeof(T1));
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_scaler = 1;
  size_t ele_nl = 1;
  size_t ele_d_LJ_a = 1;
  size_t ele_d_LJ_b = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  float pme_beta;
  float cutoff;
  int max_nl_numbers = 800;
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_atom_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_atom_energy_kernel.cc
@@ -0,0 +1,34 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14CFAtomEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14CFAtomEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_atom_energy_kernel.h
@@ -0,0 +1,114 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_atom_energy_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14CFAtomEnergyGpuKernel : public GpuKernel {
 public:
  Dihedral14CFAtomEnergyGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14CFAtomEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_cf_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_cf_scale_factor.size(); i++) ele_cf_scale_factor *= shape_cf_scale_factor[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto cf_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto ene = GetDeviceAddress<T>(outputs, 0);

    Dihedral14CFAtomEnergy(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14, b_14,
                           cf_scale_factor, ene, reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_cf_scale_factor * sizeof(T));

    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_cf_scale_factor = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_CF_ATOM_ENERGY_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_energy_kernel.cc
@@ -0,0 +1,34 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14CFEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14CFEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_cf_energy_kernel.h
@@ -0,0 +1,114 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_CF_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_CF_ENERGY_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_cf_energy_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14CFEnergyGpuKernel : public GpuKernel {
 public:
  Dihedral14CFEnergyGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14CFEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_cf_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_cf_scale_factor.size(); i++) ele_cf_scale_factor *= shape_cf_scale_factor[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto cf_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto ene = GetDeviceAddress<T>(outputs, 0);

    Dihedral14CFEnergy(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14, b_14,
                       cf_scale_factor, ene, reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_cf_scale_factor * sizeof(T));

    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_cf_scale_factor = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_atom_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_atom_energy_kernel.cc
@@ -0,0 +1,36 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14LJAtomEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJAtomEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_atom_energy_kernel.h
@@ -0,0 +1,123 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_atom_energy_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14LJAtomEnergyGpuKernel : public GpuKernel {
 public:
  Dihedral14LJAtomEnergyGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJAtomEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_lj_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_LJ_type_A = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_LJ_type_B = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_lj_scale_factor.size(); i++) ele_lj_scale_factor *= shape_lj_scale_factor[i];
    for (size_t i = 0; i < shape_LJ_type_A.size(); i++) ele_LJ_type_A *= shape_LJ_type_A[i];
    for (size_t i = 0; i < shape_LJ_type_B.size(); i++) ele_LJ_type_B *= shape_LJ_type_B[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 8);
    auto ene = GetDeviceAddress<T>(outputs, 0);

    Dihedral14LJAtomEnergy(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14, b_14,
                           lj_scale_factor, LJ_type_A, LJ_type_B, ene, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ATOM_ENERGY_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.cc
@@ -0,0 +1,38 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14LJCFForceWithAtomEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJCFForceWithAtomEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_kernel.h
@@ -0,0 +1,132 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14LJCFForceWithAtomEnergyGpuKernel : public GpuKernel {
 public:
  Dihedral14LJCFForceWithAtomEnergyGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJCFForceWithAtomEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_lj_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_cf_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_LJ_type_A = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);
    auto shape_LJ_type_B = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_lj_scale_factor.size(); i++) ele_lj_scale_factor *= shape_lj_scale_factor[i];
    for (size_t i = 0; i < shape_cf_scale_factor.size(); i++) ele_cf_scale_factor *= shape_cf_scale_factor[i];
    for (size_t i = 0; i < shape_LJ_type_A.size(); i++) ele_LJ_type_A *= shape_LJ_type_A[i];
    for (size_t i = 0; i < shape_LJ_type_B.size(); i++) ele_LJ_type_B *= shape_LJ_type_B[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto cf_scale_factor = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 8);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 9);
    auto frc_f = GetDeviceAddress<T>(outputs, 0);
    auto atom_energy = GetDeviceAddress<T>(outputs, 1);

    Dihedral14LJCFForceWithAtomEnergy(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14,
                                      b_14, lj_scale_factor, cf_scale_factor, LJ_type_A, LJ_type_B, frc_f, atom_energy,
                                      reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_cf_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_cf_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_energy_kernel.cc
@@ -0,0 +1,36 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14LJEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_energy_kernel.h
@@ -0,0 +1,124 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ENERGY_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_energy_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14LJEnergyGpuKernel : public GpuKernel {
 public:
  Dihedral14LJEnergyGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_lj_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_LJ_type_A = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_LJ_type_B = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_lj_scale_factor.size(); i++) ele_lj_scale_factor *= shape_lj_scale_factor[i];
    for (size_t i = 0; i < shape_LJ_type_A.size(); i++) ele_LJ_type_A *= shape_LJ_type_A[i];
    for (size_t i = 0; i < shape_LJ_type_B.size(); i++) ele_LJ_type_B *= shape_LJ_type_B[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 8);
    auto ene = GetDeviceAddress<T>(outputs, 0);

    Dihedral14LJEnergy(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14, b_14,
                       lj_scale_factor, LJ_type_A, LJ_type_B, ene, reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    output_size_list_.push_back(atom_numbers * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_ENERGY_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_gpu_kernel.cc
@@ -0,0 +1,36 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14LJForce,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJForceGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_gpu_kernel.h
@@ -0,0 +1,122 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14LJForceGpuKernel : public GpuKernel {
 public:
  Dihedral14LJForceGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_lj_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_LJ_type_A = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_LJ_type_B = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_lj_scale_factor.size(); i++) ele_lj_scale_factor *= shape_lj_scale_factor[i];
    for (size_t i = 0; i < shape_LJ_type_A.size(); i++) ele_LJ_type_A *= shape_LJ_type_A[i];
    for (size_t i = 0; i < shape_LJ_type_B.size(); i++) ele_LJ_type_B *= shape_LJ_type_B[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 8);
    auto frc_f = GetDeviceAddress<T>(outputs, 0);
    Dihedral14LJForce(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14, b_14,
                      lj_scale_factor, LJ_type_A, LJ_type_B, frc_f, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.cc
@@ -0,0 +1,37 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(Dihedral14LJForceWithDirectCF,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJForceWithDirectCFGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_force_with_direct_cf_kernel.h
@@ -0,0 +1,130 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_force_with_direct_cf_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T, typename T1>
 class Dihedral14LJForceWithDirectCFGpuKernel : public GpuKernel {
 public:
  Dihedral14LJForceWithDirectCFGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJForceWithDirectCFGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_LJtype = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_charge = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_boxlength_f = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_a_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    auto shape_b_14 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);
    auto shape_lj_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 6);
    auto shape_cf_scale_factor = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 7);
    auto shape_LJ_type_A = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 8);
    auto shape_LJ_type_B = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 9);

    for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i];
    for (size_t i = 0; i < shape_LJtype.size(); i++) ele_LJtype *= shape_LJtype[i];
    for (size_t i = 0; i < shape_charge.size(); i++) ele_charge *= shape_charge[i];
    for (size_t i = 0; i < shape_boxlength_f.size(); i++) ele_boxlength_f *= shape_boxlength_f[i];
    for (size_t i = 0; i < shape_a_14.size(); i++) ele_a_14 *= shape_a_14[i];
    for (size_t i = 0; i < shape_b_14.size(); i++) ele_b_14 *= shape_b_14[i];
    for (size_t i = 0; i < shape_lj_scale_factor.size(); i++) ele_lj_scale_factor *= shape_lj_scale_factor[i];
    for (size_t i = 0; i < shape_cf_scale_factor.size(); i++) ele_cf_scale_factor *= shape_cf_scale_factor[i];
    for (size_t i = 0; i < shape_LJ_type_A.size(); i++) ele_LJ_type_A *= shape_LJ_type_A[i];
    for (size_t i = 0; i < shape_LJ_type_B.size(); i++) ele_LJ_type_B *= shape_LJ_type_B[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto cf_scale_factor = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 8);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 9);
    auto frc_f = GetDeviceAddress<T>(outputs, 0);

    Dihedral14LJForceWithDirectCF(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge, boxlength_f, a_14,
                                  b_14, lj_scale_factor, cf_scale_factor, LJ_type_A, LJ_type_B, frc_f,
                                  reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_cf_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_cf_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;
  int atom_numbers;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NB14_DIHEDRAL_14_LJ_FORCE_WITH_DIRECT_CF_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.cc
@@ -0,0 +1,45 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(NeighborListUpdate,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      NeighborListUpdateGpuKernel, int, float)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.h
@@ -0,0 +1,170 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh"

 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class NeighborListUpdateGpuKernel : public GpuKernel {
 public:
  NeighborListUpdateGpuKernel() : skin(2.0), cutoff(10.0), max_atom_in_grid_numbers(64), max_neighbor_numbers(800) {}
  ~NeighborListUpdateGpuKernel() override = default;
  bool Init(const CNodePtr &kernel_node) override {
    grid_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "grid_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    refresh_count = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_count"));
    refresh_interval = static_cast<int>(GetAttr<int64_t>(kernel_node, "refresh_interval"));
    not_first_time = static_cast<int>(GetAttr<int64_t>(kernel_node, "not_first_time"));
    Nxy = static_cast<int>(GetAttr<int64_t>(kernel_node, "Nxy"));
    excluded_atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "excluded_atom_numbers"));

    cutoff_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_square"));
    half_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "half_skin_square"));
    cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin"));
    half_cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "half_cutoff_with_skin"));
    cutoff_with_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin_square"));

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto atom_numbers_in_grid_bucket = GetDeviceAddress<int>(inputs, 0);
    auto bucket = GetDeviceAddress<int>(inputs, 1);
    auto crd = GetDeviceAddress<float>(inputs, 2);
    auto box_length = GetDeviceAddress<float>(inputs, 3);
    auto grid_N = GetDeviceAddress<int>(inputs, 4);
    auto grid_length_inverse = GetDeviceAddress<float>(inputs, 5);
    auto atom_in_grid_serial = GetDeviceAddress<int>(inputs, 6);
    auto old_crd = GetDeviceAddress<float>(inputs, 7);
    auto crd_to_uint_crd_cof = GetDeviceAddress<float>(inputs, 8);
    auto uint_crd = GetDeviceAddress<unsigned int>(inputs, 9);
    auto gpointer = GetDeviceAddress<int>(inputs, 10);
    auto nl_atom_numbers = GetDeviceAddress<int>(inputs, 11);
    auto nl_atom_serial = GetDeviceAddress<int>(inputs, 12);
    auto uint_dr_to_dr_cof = GetDeviceAddress<float>(inputs, 13);
    auto excluded_list_start = GetDeviceAddress<int>(inputs, 14);
    auto excluded_list = GetDeviceAddress<int>(inputs, 15);
    auto excluded_numbers = GetDeviceAddress<int>(inputs, 16);
    auto need_refresh_flag = GetDeviceAddress<int>(inputs, 17);

    GRID_BUCKET *d_bucket = reinterpret_cast<GRID_BUCKET *>(GetDeviceAddress<int>(workspaces, 0));
    GRID_POINTER *d_gpointer = reinterpret_cast<GRID_POINTER *>(GetDeviceAddress<int>(workspaces, 1));
    NEIGHBOR_LIST *nl = GetDeviceAddress<NEIGHBOR_LIST>(workspaces, 2);
    float *half_crd_to_uint_crd_cof = GetDeviceAddress<float>(workspaces, 3);

    std::vector<GRID_BUCKET> h_bucket(grid_numbers);
    for (size_t i = 0; i < h_bucket.size(); i += 1) {
      h_bucket[i].atom_serial = bucket + i * max_atom_in_grid_numbers;
    }
    std::vector<GRID_POINTER> h_gpointer(grid_numbers);
    for (size_t i = 0; i < h_gpointer.size(); i += 1) {
      h_gpointer[i].grid_serial = gpointer + i * 125;
    }

    cudaMemcpyAsync(d_bucket, h_bucket.data(), sizeof(GRID_BUCKET) * grid_numbers, cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr));
    cudaMemcpyAsync(d_gpointer, h_gpointer.data(), sizeof(GRID_POINTER) * grid_numbers, cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr));
    Construct_Neighbor_List(atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl,
                            reinterpret_cast<cudaStream_t>(stream_ptr));

    Neighbor_List_Update(grid_numbers, atom_numbers, refresh_count, refresh_interval, not_first_time, skin, Nxy,
                         cutoff_square, cutoff_with_skin_square, grid_N, box_length, atom_numbers_in_grid_bucket,
                         grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof,
                         half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start,
                         excluded_list, excluded_numbers, half_skin_square, need_refresh_flag,
                         reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(sizeof(int) * grid_numbers);
    input_size_list_.push_back(sizeof(int) * max_atom_in_grid_numbers * grid_numbers);
    input_size_list_.push_back(sizeof(VECTOR) * atom_numbers);
    input_size_list_.push_back(sizeof(VECTOR));

    input_size_list_.push_back(sizeof(INT_VECTOR));
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(sizeof(int) * atom_numbers);

    input_size_list_.push_back(sizeof(VECTOR) * atom_numbers);
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(sizeof(UNSIGNED_INT_VECTOR) * atom_numbers);

    input_size_list_.push_back(sizeof(int) * grid_numbers * 125);
    input_size_list_.push_back(sizeof(int) * atom_numbers);
    input_size_list_.push_back(sizeof(int) * atom_numbers * max_neighbor_numbers);
    input_size_list_.push_back(sizeof(VECTOR));

    input_size_list_.push_back(sizeof(int) * atom_numbers);
    input_size_list_.push_back(sizeof(int) * excluded_atom_numbers);
    input_size_list_.push_back(sizeof(int) * atom_numbers);

    input_size_list_.push_back(sizeof(int));

    workspace_size_list_.push_back(sizeof(GRID_BUCKET) * grid_numbers);
    workspace_size_list_.push_back(sizeof(GRID_POINTER) * grid_numbers);
    workspace_size_list_.push_back(sizeof(NEIGHBOR_LIST) * atom_numbers);
    workspace_size_list_.push_back(sizeof(float) * 3);

    output_size_list_.push_back(sizeof(float));
  }

 private:
  float skin;
  float cutoff;
  int not_first_time;
  int atom_numbers;
  int grid_numbers;
  int refresh_count;
  int refresh_interval;
  int Nxy;
  int max_atom_in_grid_numbers;
  int max_neighbor_numbers;
  int excluded_atom_numbers;
  float half_skin_square;
  float cutoff_square;
  float cutoff_with_skin;
  float half_cutoff_with_skin;
  float cutoff_with_skin_square;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
 };
 }  // namespace kernel
 }  // namespace mindspore

 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_kernel.cc
@@ -0,0 +1,32 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(MDIterationLeapFrog,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      MDIterationLeapFrogGpuKernel, float)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/nvtit/md_iteration_leap_frog_kernel.h
@@ -0,0 +1,115 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nvtit/md_iteration_leap_frog_impl.cuh"

 namespace mindspore {
 namespace kernel {

 template <typename T>
 class MDIterationLeapFrogGpuKernel : public GpuKernel {
 public:
  MDIterationLeapFrogGpuKernel() : ele_mass_inverse(1) {}
  ~MDIterationLeapFrogGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    float4_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "float4_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    half_dt = static_cast<float>(GetAttr<float>(kernel_node, "half_dt"));
    dt = static_cast<float>(GetAttr<float>(kernel_node, "dt"));
    exp_gamma = static_cast<float>(GetAttr<float>(kernel_node, "exp_gamma"));
    is_max_velocity = static_cast<int>(GetAttr<int64_t>(kernel_node, "is_max_velocity"));
    max_velocity = static_cast<float>(GetAttr<float>(kernel_node, "max_velocity"));

    // printf("float4_numbers: %d", float4_numbers);
    // printf("atom_numbers: %d", atom_numbers);
    // printf("half_dt: %f", half_dt);
    // printf("dt: %f", dt);
    // printf("exp_gamma: %f", exp_gamma);
    // printf("is_max_velocity: %d", is_max_velocity);
    // printf("max_velocity: %f", max_velocity);

    auto shape_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_qrt_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);

    for (size_t i = 0; i < shape_mass_inverse.size(); i++) ele_mass_inverse *= shape_mass_inverse[i];
    for (size_t i = 0; i < shape_qrt_mass.size(); i++) ele_sqrt_mass *= shape_qrt_mass[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto d_mass_inverse = GetDeviceAddress<const T>(inputs, 0);
    auto d_sqrt_mass = GetDeviceAddress<const T>(inputs, 1);

    auto vel_f = GetDeviceAddress<T>(outputs, 0);
    auto crd_f = GetDeviceAddress<T>(outputs, 1);
    auto frc_f = GetDeviceAddress<T>(outputs, 2);
    auto acc_f = GetDeviceAddress<T>(outputs, 3);

    MDIterationLeapFrog(float4_numbers, atom_numbers, half_dt, dt, exp_gamma, is_max_velocity, max_velocity,
                        d_mass_inverse, d_sqrt_mass, vel_f, crd_f, frc_f, acc_f,
                        reinterpret_cast<cudaStream_t>(stream_ptr));

    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_mass_inverse * sizeof(T));
    input_size_list_.push_back(ele_sqrt_mass * sizeof(T));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
    output_size_list_.push_back(3 * atom_numbers * sizeof(T));
  }

 private:
  size_t ele_mass_inverse = 1;
  size_t ele_sqrt_mass = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int float4_numbers;
  int atom_numbers;
  float half_dt;
  float dt;
  float exp_gamma;
  int is_max_velocity;
  float max_velocity;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_NVTIT_MD_ITERATION_LEAP_FROG_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_energy_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_energy_kernel.cc
@@ -0,0 +1,38 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/pme/pme_energy_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(PMEEnergy,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEEnergyGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_energy_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_energy_kernel.h
@@ -0,0 +1,147 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_ENERGY_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_ENERGY_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <cufft.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_energy_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class PMEEnergyGpuKernel : public GpuKernel {
 public:
  PMEEnergyGpuKernel() : ele_uint_crd(1) {}
  ~PMEEnergyGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
    ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
    PME_Nall = fftx * ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto boxlength = GetDeviceAddress<T>(inputs, 0);
    auto uint_crd = GetDeviceAddress<T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto nl_numbers = GetDeviceAddress<T1>(inputs, 3);
    auto nl_serial = GetDeviceAddress<T1>(inputs, 4);
    auto scaler = GetDeviceAddress<T>(inputs, 5);
    auto excluded_list_start = GetDeviceAddress<int>(inputs, 6);
    auto excluded_list = GetDeviceAddress<int>(inputs, 7);
    auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 8);

    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);       // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);    // workspace
    auto pme_q = GetDeviceAddress<T>(workspace, 2);            // workspace
    auto pme_fq = GetDeviceAddress<float>(workspace, 3);       // workspace
    auto pme_atom_near = GetDeviceAddress<int>(workspace, 4);  // workspace
    auto pme_bc = GetDeviceAddress<float>(workspace, 5);       // workspace
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);       // workspace
    auto nl = GetDeviceAddress<T1>(workspace, 7);

    auto reciprocal_ene = GetDeviceAddress<T>(outputs, 0);
    auto self_ene = GetDeviceAddress<T>(outputs, 1);
    auto direct_ene = GetDeviceAddress<T>(outputs, 2);
    auto correction_ene = GetDeviceAddress<T>(outputs, 3);

    PMEEnergy(fftx, ffty, fftz, atom_numbers, beta, boxlength, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq,
              pme_atom_near, pme_kxyz, uint_crd, charge, nl_numbers, nl_serial, nl, scaler, excluded_list_start,
              excluded_list, excluded_atom_numbers, reciprocal_ene, self_ene, direct_ene, correction_ene,
              reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(max_nl_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));

    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));

    workspace_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    workspace_size_list_.push_back(PME_Nall * sizeof(T));
    workspace_size_list_.push_back(PME_Nfft * sizeof(cufftComplex));
    workspace_size_list_.push_back(atom_numbers * 64 * sizeof(int));
    workspace_size_list_.push_back(PME_Nfft * sizeof(float));
    workspace_size_list_.push_back(64 * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * max_nl_numbers * sizeof(T1));

    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
    output_size_list_.push_back(sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int max_nl_numbers = 800;
  int fftx;
  int ffty;
  int fftz;
  float beta;
  int PME_Nall;
  int PME_Nfft;
  struct VECTOR {
    float x;
    float y;
    float z;
  };

  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };

  struct NEIGHBOR_LIST {
    int atom_numbers;
    int *atom_serial;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_kernel.cc
@@ -0,0 +1,32 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(PMEExcludedForce,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEExcludedForceGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_excluded_force_kernel.h
@@ -0,0 +1,95 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_EXCLUDED_FORCE_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <cufft.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_excluded_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class PMEExcludedForceGpuKernel : public GpuKernel {
 public:
  PMEExcludedForceGpuKernel() : ele_uint_crd(1) {}
  ~PMEExcludedForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd = GetDeviceAddress<int>(inputs, 0);
    auto sacler = GetDeviceAddress<T>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);
    auto excluded_list_start = GetDeviceAddress<int>(inputs, 3);
    auto excluded_list = GetDeviceAddress<int>(inputs, 4);
    auto excluded_atom_numbers = GetDeviceAddress<int>(inputs, 5);

    auto force = GetDeviceAddress<T>(outputs, 0);
    PMEExcludedForce(atom_numbers, beta, uint_crd, sacler, charge, excluded_list_start, excluded_list,
                     excluded_atom_numbers, force, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(T));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));
    input_size_list_.push_back(atom_numbers * sizeof(T1));

    output_size_list_.push_back(atom_numbers * 3 * sizeof(T));
  }

 private:
  size_t ele_uint_crd = 1;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  float beta;
  struct VECTOR {
    float x;
    float y;
    float z;
  };

  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_kernel.cc
@@ -0,0 +1,29 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_kernel.h"

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(PMEReciprocalForce,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      PMEReciprocalForceGpuKernel, float, int)

 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/sponge/pme/pme_reciprocal_force_kernel.h
@@ -0,0 +1,119 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_PME_PME_RECIPROCAL_FORCE_KERNEL_H_
 #include <cuda_runtime_api.h>
 #include <cufft.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "runtime/device/gpu/cuda_common.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/sponge/pme/pme_reciprocal_force_impl.cuh"
 namespace mindspore {
 namespace kernel {
 template <typename T, typename T1>
 class PMEReciprocalForceGpuKernel : public GpuKernel {
 public:
  PMEReciprocalForceGpuKernel() : ele_uint_crd(1) {}
  ~PMEReciprocalForceGpuKernel() override = default;

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));
    beta = static_cast<float>(GetAttr<float_t>(kernel_node, "beta"));
    fftx = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftx"));
    ffty = static_cast<int>(GetAttr<int64_t>(kernel_node, "ffty"));
    fftz = static_cast<int>(GetAttr<int64_t>(kernel_node, "fftz"));
    PME_Nall = fftx * ffty * fftz;
    PME_Nfft = fftx * ffty * (fftz / 2 + 1);

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto boxlength = GetDeviceAddress<T>(inputs, 0);
    auto uint_crd = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<T>(inputs, 2);

    auto pme_uxyz = GetDeviceAddress<int>(workspace, 0);       // workspace
    auto pme_frxyz = GetDeviceAddress<float>(workspace, 1);    // workspace
    auto pme_q = GetDeviceAddress<T>(workspace, 2);            // workspace
    auto pme_fq = GetDeviceAddress<float>(workspace, 3);       // workspace
    auto pme_atom_near = GetDeviceAddress<int>(workspace, 4);  // workspace
    auto pme_bc = GetDeviceAddress<float>(workspace, 5);       // workspace
    auto pme_kxyz = GetDeviceAddress<int>(workspace, 6);       // workspace

    auto force = GetDeviceAddress<T>(outputs, 0);

    PMEReciprocalForce(fftx, ffty, fftz, atom_numbers, beta, pme_bc, pme_uxyz, pme_frxyz, pme_q, pme_fq, pme_atom_near,
                       pme_kxyz, boxlength, uint_crd, charge, force, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(sizeof(VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    input_size_list_.push_back(atom_numbers * sizeof(VECTOR));

    workspace_size_list_.push_back(atom_numbers * sizeof(UNSIGNED_INT_VECTOR));
    workspace_size_list_.push_back(atom_numbers * sizeof(VECTOR));
    workspace_size_list_.push_back(PME_Nall * sizeof(T));
    workspace_size_list_.push_back(PME_Nfft * sizeof(cufftComplex));
    workspace_size_list_.push_back(atom_numbers * 64 * sizeof(int));
    workspace_size_list_.push_back(PME_Nfft * sizeof(float));
    workspace_size_list_.push_back(64 * sizeof(UNSIGNED_INT_VECTOR));

    output_size_list_.push_back(atom_numbers * sizeof(VECTOR));
  }

 private:
  size_t ele_uint_crd = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int atom_numbers;
  int fftx;
  int ffty;
  int fftz;
  float beta;
  int PME_Nall;
  int PME_Nfft;

  struct VECTOR {
    float x;
    float y;
    float z;
  };

  struct UNSIGNED_INT_VECTOR {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
  };
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif
--- a/mindspore/ccsrc/cxx_api/CMakeLists.txt
+++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt
@@ -114,7 +114,8 @@ if(ENABLE_GPU)
                          ${CUDNN_LIBRARY_PATH}
                          ${CUDA_PATH}/lib64/libcudart.so
                          ${CUDA_PATH}/lib64/stubs/libcuda.so
                          ${CUDA_PATH}/lib64/libcusolver.so)
                          ${CUDA_PATH}/lib64/libcusolver.so
                          ${CUDA_PATH}/lib64/libcufft.so)
 endif()

 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
--- a/mindspore/ops/operations/init.py
+++ b/mindspore/ops/operations/init.py
@@ -100,8 +100,13 @@ from ._embedding_cache_ops import (CacheSwapHashmap, SearchCacheIdx, CacheSwapTa
                                   MapUniform, DynamicAssign, PadAndShift)
 from .quantum_ops import PQC, Evolution
 from .sponge_ops import (BondForce, BondEnergy, BondAtomEnergy, BondForceWithAtomEnergy, BondForceWithAtomVirial,
                         DihedralForce, DihedralEnergy, DihedralAtomEnergy, DihedralForceWithAtomEnergy,
                         AngleForce, AngleEnergy, AngleAtomEnergy, AngleForceWithAtomEnergy)
                         DihedralForce, DihedralEnergy, DihedralAtomEnergy, DihedralForceWithAtomEnergy, AngleForce,
                         AngleEnergy, AngleAtomEnergy, AngleForceWithAtomEnergy, PMEReciprocalForce,
                         LJForce, LJEnergy, LJForceWithPMEDirectForce, PMEExcludedForce, PMEEnergy, Dihedral14LJForce,
                         Dihedral14LJForceWithDirectCF, Dihedral14LJEnergy, Dihedral14LJCFForceWithAtomEnergy,
                         Dihedral14LJAtomEnergy, Dihedral14CFEnergy, Dihedral14CFAtomEnergy, MDIterationLeapFrog,
                         GetCenterOfGeometry, MDTemperature, NeighborListUpdate)


 __all__ = [
    'Unique',
@@ -438,6 +443,24 @@ __all__ = [
    "AngleEnergy",
    "AngleAtomEnergy",
    "AngleForceWithAtomEnergy",
    'PMEReciprocalForce',
    'LJForce',
    'LJForceWithPMEDirectForce',
    'LJEnergy',
    'PMEExcludedForce',
    'PMEEnergy',
    "Dihedral14LJForce",
    "Dihedral14LJEnergy",
    "Dihedral14LJForceWithDirectCF",
    "Dihedral14LJCFForceWithAtomEnergy",
    "Dihedral14LJAtomEnergy",
    "Dihedral14CFEnergy",
    "MDIterationLeapFrog",
    "Dihedral14CFAtomEnergy",
    "GetCenterOfGeometry",
    "MDTemperature",
    "NeighborListUpdate",

 ]

 __all__.sort()
--- a/mindspore/ops/operations/sponge_ops.py
+++ b/mindspore/ops/operations/sponge_ops.py