| @@ -0,0 +1,75 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/util.cuh" | |||
// One thread per bond: for harmonic bond energy E = k * (|dr| - r0)^2,
// computes the pair force F = dE/dr * dr/|dr|, atomically accumulates -F on
// atom_a and +F on atom_b, and adds the bond energy and the virial
// contribution (-dE/dr * |dr|) onto atom_a only.
// Launch with a 1-D grid covering at least bond_numbers threads.
__global__ void BondForceWithAtomEnergyAndVirialKernel(const int bond_numbers, const UNSIGNED_INT_VECTOR *uint_crd,
                                                       const VECTOR *scaler, const int *atom_a, const int *atom_b,
                                                       const float *bond_k, const float *bond_r0, VECTOR *frc,
                                                       float *atom_energy, float *atom_virial) {
  int bond_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (bond_i < bond_numbers) {
    int atom_i = atom_a[bond_i];
    int atom_j = atom_b[bond_i];
    float k = bond_k[bond_i];
    float r0 = bond_r0[bond_i];
    // Minimum-image displacement between the two bonded atoms.
    VECTOR dr = Get_Periodic_Displacement(uint_crd[atom_i], uint_crd[atom_j], scaler[0]);
    float abs_r = norm3df(dr.x, dr.y, dr.z);
    float r_1 = 1.0f / abs_r;  // float literal: avoid silent double-precision division
    float tempf2 = abs_r - r0;
    float tempf = 2.0f * tempf2 * k;  // dE/dr for E = k * (r - r0)^2
    VECTOR f = tempf * r_1 * dr;
    atomicAdd(&frc[atom_i].x, -f.x);
    atomicAdd(&frc[atom_i].y, -f.y);
    atomicAdd(&frc[atom_i].z, -f.z);
    atomicAdd(&frc[atom_j].x, f.x);
    atomicAdd(&frc[atom_j].y, f.y);
    atomicAdd(&frc[atom_j].z, f.z);
    // Whole-bond energy/virial are credited to atom_i (per-atom decomposition
    // convention used by the per-bond accumulation in this module).
    atomicAdd(&atom_virial[atom_i], -tempf * abs_r);
    atomicAdd(&atom_energy[atom_i], k * tempf2 * tempf2);
  }
}
// Host-side launcher: zeroes the three accumulation buffers, reinterprets the
// flat float/uint buffers as packed vector types, then runs one thread per
// bond on the given stream.
void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f,
                                      const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k,
                                      const float *bond_r0, float *frc_f, float *atom_energy, float *atom_v,
                                      cudaStream_t stream) {
  // Clear force (3 floats per atom), virial, and energy before accumulation.
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, frc_f, 0.);
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_v, 0.);
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_energy, 0.);
  UNSIGNED_INT_VECTOR *crd_vec =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  VECTOR *frc_vec = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));
  VECTOR *scaler_vec = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  const size_t threads_per_block = 128;
  const size_t blocks_per_grid = ceilf(static_cast<float>(bond_numbers) / 128);
  BondForceWithAtomEnergyAndVirialKernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
    bond_numbers, crd_vec, scaler_vec, atom_a, atom_b, bond_k, bond_r0, frc_vec, atom_energy, atom_v);
}
| void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, | |||
| const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, | |||
| const float *bond_r0, float *frc_f, float *atom_energy, float *atom_v, | |||
| cudaStream_t stream); | |||
| @@ -0,0 +1,27 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_BOND_BOND_FORCE_WITH_ATOM_VIRIAL_IMPL_H_ | |||
| #include <curand_kernel.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| void BondForceWithAtomEnergyAndVirial(int bond_numbers, int atom_numbers, const unsigned int *uint_crd_f, | |||
| const float *scaler_f, const int *atom_a, const int *atom_b, const float *bond_k, | |||
| const float *bond_r0, float *frc_f, float *atom_energy, float *atom_v, | |||
| cudaStream_t stream); | |||
| #endif | |||
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/get_center_of_mass_impl.cuh" | |||
// Grid-stride loop, one thread per residue: sums mass-weighted coordinates
// over the residue's atom index range [start[i], end[i]), then multiplies by
// the precomputed inverse residue mass to yield the center of mass.
__global__ void Get_Center_Of_Mass(int residue_numbers, int *start, int *end, VECTOR *crd, float *atom_mass,
                                   float *residue_mass_inverse, VECTOR *center_of_mass) {
  const int stride = gridDim.x * blockDim.x;
  for (int res = blockDim.x * blockIdx.x + threadIdx.x; res < residue_numbers; res += stride) {
    VECTOR weighted_sum = {0.0f, 0.0f, 0.0f};
    const int first_atom = start[res];
    const int last_atom = end[res];
    for (int atom = first_atom; atom < last_atom; ++atom) {
      weighted_sum = weighted_sum + atom_mass[atom] * crd[atom];
    }
    center_of_mass[res] = residue_mass_inverse[res] * weighted_sum;
  }
}
// Host-side launcher: zeroes the output buffer (3 floats per residue), then
// computes per-residue centers of mass on the given stream.
void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass,
                     float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3. * residue_numbers) / 128), 128, 0, stream>>>(3 * residue_numbers,
                                                                                        center_of_mass_f, 0.);
  VECTOR *crd_vec = reinterpret_cast<VECTOR *>(crd_f);
  VECTOR *com_vec = reinterpret_cast<VECTOR *>(center_of_mass_f);
  // Fixed 20x32 launch; the kernel's grid-stride loop covers any residue count.
  Get_Center_Of_Mass<<<20, 32, 0, stream>>>(residue_numbers, start, end, crd_vec, atom_mass, residue_mass_inverse,
                                            com_vec);
}
| void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass, | |||
| float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream); | |||
| @@ -0,0 +1,26 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTEROFMASS_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTEROFMASS_IMPL_H_ | |||
| #include <curand_kernel.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| void GetCenterOfMass(int residue_numbers, int *start, int *end, float *crd_f, float *atom_mass, | |||
| float *residue_mass_inverse, float *center_of_mass_f, cudaStream_t stream); | |||
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_GETCENTEROFMASS_IMPL_H_
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/map_center_of_mass_impl.cuh" | |||
// Wraps each residue back into the primary periodic box as a rigid unit.
// Thread layout: x-dimension indexes residues (grid-stride), y-dimension
// parallelizes over atoms within a residue, so launch with a 2-D block.
// For each residue the center of mass is mapped into [0, box) per axis; the
// resulting translation (scaled by `scaler`) is applied to every atom of the
// residue, producing wrapped coordinates in `crd` from `no_wrap_crd`.
__global__ void Map_Center_Of_Mass(int residue_numbers, int *start, int *end, float scaler, VECTOR *center_of_mass,
                                   VECTOR *box_length, VECTOR *no_wrap_crd, VECTOR *crd) {
  VECTOR trans_vec;
  VECTOR com;
  for (int residue_i = blockDim.x * blockIdx.x + threadIdx.x; residue_i < residue_numbers;
       residue_i += gridDim.x * blockDim.x) {
    com = center_of_mass[residue_i];
    // Center of mass folded into the box: com - floor(com / L) * L.
    trans_vec.x = com.x - floorf(com.x / box_length[0].x) * box_length[0].x;
    trans_vec.y = com.y - floorf(com.y / box_length[0].y) * box_length[0].y;
    trans_vec.z = com.z - floorf(com.z / box_length[0].z) * box_length[0].z;
    // Translation that moves the residue's COM to its wrapped position.
    trans_vec = scaler * trans_vec - com;
    // All threadIdx.y lanes of this residue apply the same translation.
    for (int atom_i = start[residue_i] + threadIdx.y; atom_i < end[residue_i]; atom_i += blockDim.y) {
      crd[atom_i] = no_wrap_crd[atom_i] + trans_vec;
    }
  }
}
// Host-side launcher: reinterprets the flat float buffers as packed VECTOR
// triples and launches the residue-wrapping kernel on the given stream.
void MapCenterOfMass(int residue_numbers, int *start, int *end, float scaler, float *center_of_mass_f,
                     float *box_length_f, float *no_wrap_crd_f, float *crd_f, cudaStream_t stream) {
  VECTOR *wrapped_crd = reinterpret_cast<VECTOR *>(crd_f);
  VECTOR *unwrapped_crd = reinterpret_cast<VECTOR *>(no_wrap_crd_f);
  VECTOR *box = reinterpret_cast<VECTOR *>(box_length_f);
  VECTOR *com = reinterpret_cast<VECTOR *>(center_of_mass_f);
  // 20 blocks of 32x4 threads: x spans residues (grid-stride in the kernel),
  // y spans atoms inside a residue.
  Map_Center_Of_Mass<<<20, {32, 4}, 0, stream>>>(residue_numbers, start, end, scaler, com, box, unwrapped_crd,
                                                 wrapped_crd);
}
| void MapCenterOfMass(int residue_numbers, int *start, int *end, float scaler, float *center_of_mass_f, | |||
| float *box_length_f, float *no_wrap_crd_f, float *crd_f, cudaStream_t stream); | |||
| @@ -0,0 +1,26 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_ | |||
| #include <curand_kernel.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| void MapCenterOfMass(int residue_numbers, int *start, int *end, float scaler, float *center_of_mass_f, | |||
| float *box_length_f, float *no_wrap_crd_f, float *crd_f, cudaStream_t stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_COMMON_MAPCENTEROFMASS_IMPL_H_ | |||
| @@ -0,0 +1,147 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/lj/lj_pme_direct_force_with_atom_energy_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh" | |||
// Lennard-Jones + PME direct-space Coulomb force with per-atom energy.
// Thread layout: x-dimension indexes atoms, y-dimension parallelizes over each
// atom's neighbor list, so launch with a 2-D block (e.g. 8 x N).
// For each neighbor pair within `cutoff`, accumulates the LJ 12-6 force
// (A/B tables indexed by a triangular pair-type index) plus the erfc-screened
// direct-space Coulomb force. The pair force is atomically subtracted from the
// neighbor and the running sum is atomically added to atom_i at the end; the
// pair energy (LJ + direct CF) is credited to atom_i only.
// NOTE(review): no reduction over threadIdx.y before the trailing atomicAdds —
// presumably each y-lane's partial contributes independently via atomics.
__global__ void LJ_Direct_CF_Force_With_Atom_Energy_CUDA(const int atom_numbers, const NEIGHBOR_LIST *nl,
                                                         const UINT_VECTOR_LJ_TYPE *uint_crd, const VECTOR *boxlength,
                                                         const float *LJ_type_A, const float *LJ_type_B,
                                                         const float cutoff, VECTOR *frc, const float pme_beta,
                                                         const float sqrt_pi, float *atom_energy) {
  int atom_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (atom_i < atom_numbers) {
    NEIGHBOR_LIST nl_i = nl[atom_i];
    int N = nl_i.atom_numbers;  // number of neighbors of atom_i
    int atom_j;
    int int_x;
    int int_y;
    int int_z;
    UINT_VECTOR_LJ_TYPE r1 = uint_crd[atom_i], r2;
    VECTOR dr;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_6;
    float frc_abs = 0.;
    VECTOR frc_lin;
    VECTOR frc_record = {0., 0., 0.};  // per-thread force accumulator for atom_i
    float charge_i = r1.charge;
    float charge_j;
    float dr_abs;
    float dr_1;
    float beta_dr;
    float frc_cf_abs;
    float ene_lin = 0.;   // LJ energy accumulator
    float ene_lin2 = 0.;  // direct-space Coulomb energy accumulator
    int x, y;
    int atom_pair_LJ_type;
    // Each threadIdx.y lane walks a strided slice of the neighbor list.
    for (int j = threadIdx.y; j < N; j = j + blockDim.y) {
      atom_j = nl_i.atom_serial[j];
      r2 = uint_crd[atom_j];
      charge_j = r2.charge;
      // Unsigned-int coordinate difference; wraparound of the subtraction
      // implements the periodic minimum image once scaled by boxlength.
      int_x = r2.uint_x - r1.uint_x;
      int_y = r2.uint_y - r1.uint_y;
      int_z = r2.uint_z - r1.uint_z;
      dr.x = boxlength[0].x * int_x;
      dr.y = boxlength[0].y * int_y;
      dr.z = boxlength[0].z * int_z;
      dr_abs = norm3df(dr.x, dr.y, dr.z);
      if (dr_abs < cutoff) {
        dr_1 = 1. / dr_abs;
        dr_2 = dr_1 * dr_1;
        dr_4 = dr_2 * dr_2;
        dr_8 = dr_4 * dr_4;
        dr_6 = dr_4 * dr_2;
        // Branchless |LJ_type difference| via sign-bit trick, then min/max of
        // the two types, then the triangular-matrix pair index:
        // index = max*(max+1)/2 + min.
        y = (r2.LJ_type - r1.LJ_type);
        x = y >> 31;
        y = (y ^ x) - x;
        x = r2.LJ_type + r1.LJ_type;
        r2.LJ_type = (x + y) >> 1;
        x = (x - y) >> 1;
        atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;
        // LJ force magnitude over r: (-A/r^6 + B)/r^8 (A, B pre-scaled tables).
        frc_abs = (-LJ_type_A[atom_pair_LJ_type] * dr_6 + LJ_type_B[atom_pair_LJ_type]) * dr_8;
        // PME direct-space Coulomb: d/dr of erfc(beta*r)/r, divided by r.
        beta_dr = pme_beta * dr_abs;
        frc_cf_abs = beta_dr * sqrt_pi * expf(-beta_dr * beta_dr) + erfcf(beta_dr);
        frc_cf_abs = frc_cf_abs * dr_2 * dr_1;
        frc_cf_abs = charge_i * charge_j * frc_cf_abs;
        ene_lin2 = ene_lin2 + charge_i * charge_j * erfcf(beta_dr) * dr_1;
        // LJ energy: A/(12 r^12) - B/(6 r^6) (tables carry the 12/6 factors).
        ene_lin =
          ene_lin +
          (0.083333333 * LJ_type_A[atom_pair_LJ_type] * dr_6 - 0.166666666 * LJ_type_B[atom_pair_LJ_type]) * dr_6;
        frc_abs = frc_abs - frc_cf_abs;
        frc_lin.x = frc_abs * dr.x;
        frc_lin.y = frc_abs * dr.y;
        frc_lin.z = frc_abs * dr.z;
        frc_record.x = frc_record.x + frc_lin.x;
        frc_record.y = frc_record.y + frc_lin.y;
        frc_record.z = frc_record.z + frc_lin.z;
        // Newton's third law: neighbor receives the opposite force.
        atomicAdd(&frc[atom_j].x, -frc_lin.x);
        atomicAdd(&frc[atom_j].y, -frc_lin.y);
        atomicAdd(&frc[atom_j].z, -frc_lin.z);
      }
    }
    atomicAdd(&frc[atom_i].x, frc_record.x);
    atomicAdd(&frc[atom_i].y, frc_record.y);
    atomicAdd(&frc[atom_i].z, frc_record.z);
    atomicAdd(&atom_energy[atom_i], ene_lin + ene_lin2);
  }
}
// Host-side launcher for the combined LJ + PME direct-space Coulomb force:
// zeroes the force buffer, rebuilds the NEIGHBOR_LIST views from the flat
// arrays, packs coordinates with LJ type and charge, then launches the force
// kernel. `thread_LJ` and TWO_DIVIDED_BY_SQRT_PI come from common_sponge.cuh
// (not visible here — presumably a 2-D block dim; confirm against that header).
void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta,
                                   const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f,
                                   float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial, int *nl,
                                   const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_energy,
                                   cudaStream_t stream) {
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, frc_f, 0.);
  VECTOR *frc = reinterpret_cast<VECTOR *>(frc_f);
  VECTOR *scaler = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(scaler_f));
  // Fixed capacity per atom's neighbor slice in the flat nl buffer.
  int max_neighbor_numbers = 800;
  NEIGHBOR_LIST *nl_a = reinterpret_cast<NEIGHBOR_LIST *>(nl);
  construct_neighbor_list_kernel<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
    atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl_a);
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ_a = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ);
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  // Merge coordinates, LJ type, and charge into the packed per-atom struct.
  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ_a, LJtype, charge);
  LJ_Direct_CF_Force_With_Atom_Energy_CUDA<<<ceilf(static_cast<float>(atom_numbers) / 8), thread_LJ, 0, stream>>>(
    atom_numbers, nl_a, uint_crd_with_LJ_a, scaler, d_LJ_A, d_LJ_B, cutoff, frc, pme_beta, TWO_DIVIDED_BY_SQRT_PI,
    atom_energy);
  return;
}
| void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta, | |||
| const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, | |||
| float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial, int *nl, | |||
| const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_energy, | |||
| cudaStream_t stream); | |||
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_PME_DIRECT_FORCE_WITH_ATOM_ENERGY_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_LJ_LJ_PME_DIRECT_FORCE_WITH_ATOM_ENERGY_IMPL_H_ | |||
| #include <curand_kernel.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| void LJDirectCFForceWithAtomEnergy(const int atom_numbers, const float cutoff, const float pme_beta, | |||
| const int *uint_crd_f, const int *LJtype, const float *charge, const float *scaler_f, | |||
| float *uint_crd_with_LJ, int *nl_atom_numbers, int *nl_atom_serial, int *nl, | |||
| const float *d_LJ_A, const float *d_LJ_B, float *frc_f, float *atom_energy, | |||
| cudaStream_t stream); | |||
| #endif | |||
| @@ -0,0 +1,139 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common_sponge.cuh" | |||
// One thread per 1-4 (dihedral-end) pair: computes the scaled LJ 12-6 force
// plus the scaled Coulomb force between atoms a_14[i] and b_14[i], atomically
// accumulates +/-F on the two atoms, and credits the pair energy and virial
// (-F . dr) to the first atom. Launch with a 1-D grid covering
// dihedral_14_numbers threads.
__global__ void Dihedral14LJCFForceWithAtomEnergyAndVirialKernel(
  const int dihedral_14_numbers, const UINT_VECTOR_LJ_TYPE *uint_crd, const VECTOR *scaler, const int *a_14,
  const int *b_14, const float *lj_scale_factor, const float *cf_scale_factor, const float *LJ_type_A,
  const float *LJ_type_B, VECTOR *frc, float *atom_energy, float *atom_virial) {
  int dihedral_14_i = blockDim.x * blockIdx.x + threadIdx.x;
  if (dihedral_14_i < dihedral_14_numbers) {
    UINT_VECTOR_LJ_TYPE r1, r2;
    VECTOR dr;
    float dr_abs;
    float dr2;
    float dr_1;
    float dr_2;
    float dr_4;
    float dr_8;
    float dr_14;
    float frc_abs = 0.;
    VECTOR temp_frc;
    float ene_lin;
    float ene_lin2;
    int x, y;
    int atom_pair_LJ_type;
    int atom_i = a_14[dihedral_14_i];
    int atom_j = b_14[dihedral_14_i];
    r1 = uint_crd[atom_i];
    r2 = uint_crd[atom_j];
    // Minimum-image displacement and the inverse-power ladder used below.
    dr = Get_Periodic_Displacement(r2, r1, scaler[0]);
    dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
    dr_2 = 1.0 / dr2;
    dr_4 = dr_2 * dr_2;
    dr_8 = dr_4 * dr_4;
    dr_14 = dr_8 * dr_4 * dr_2;
    dr_abs = norm3df(dr.x, dr.y, dr.z);
    dr_1 = 1. / dr_abs;
    // CF: scaled Coulomb force magnitude over r: -qi*qj*scale / r^3.
    float charge_i = r1.charge;
    float charge_j = r2.charge;
    float frc_cf_abs;
    frc_cf_abs = cf_scale_factor[dihedral_14_i] * dr_2 * dr_1;
    frc_cf_abs = -charge_i * charge_j * frc_cf_abs;
    // LJ: branchless |type difference| via sign-bit trick, min/max of the two
    // types, then the triangular pair index max*(max+1)/2 + min.
    y = (r2.LJ_type - r1.LJ_type);
    x = y >> 31;
    y = (y ^ x) - x;
    x = r2.LJ_type + r1.LJ_type;
    r2.LJ_type = (x + y) >> 1;
    x = (x - y) >> 1;
    atom_pair_LJ_type = (r2.LJ_type * (r2.LJ_type + 1) >> 1) + x;
    frc_abs = -LJ_type_A[atom_pair_LJ_type] * dr_14 + LJ_type_B[atom_pair_LJ_type] * dr_8;
    frc_abs *= lj_scale_factor[dihedral_14_i];
    frc_abs += frc_cf_abs;
    temp_frc.x = frc_abs * dr.x;
    temp_frc.y = frc_abs * dr.y;
    temp_frc.z = frc_abs * dr.z;
    // Newton's third law: opposite forces on the pair.
    atomicAdd(&frc[atom_j].x, -temp_frc.x);
    atomicAdd(&frc[atom_j].y, -temp_frc.y);
    atomicAdd(&frc[atom_j].z, -temp_frc.z);
    atomicAdd(&frc[atom_i].x, temp_frc.x);
    atomicAdd(&frc[atom_i].y, temp_frc.y);
    atomicAdd(&frc[atom_i].z, temp_frc.z);
    ene_lin = r1.charge * r2.charge * dr_1;
    ene_lin *= cf_scale_factor[dihedral_14_i];
    ene_lin2 = 0.08333333 * LJ_type_A[atom_pair_LJ_type] * dr_4 * dr_8 -
               0.1666666 * LJ_type_B[atom_pair_LJ_type] * dr_4 * dr_2;  // LJ A/B tables were pre-multiplied by
                                                                       // 12 and 6, so divide back (1/12, 1/6) here
    ene_lin2 *= lj_scale_factor[dihedral_14_i];
    atomicAdd(&atom_energy[atom_i], ene_lin + ene_lin2);
    atomicAdd(&atom_virial[atom_i], -temp_frc * dr);
  }
}
// Host-side launcher: packs coordinates with LJ type and charge, zeroes the
// force, energy, and virial accumulators, then launches one thread per 1-4
// pair on the given stream.
void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers,
                                                const int *uint_crd_f, const int *LJtype, const float *charge,
                                                float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14,
                                                const int *b_14, const float *lj_scale_factor,
                                                const float *cf_scale_factor, const float *LJ_type_A,
                                                const float *LJ_type_B, float *frc_f, float *atom_energy,
                                                float *atom_virial, cudaStream_t stream) {
  size_t thread_per_block = 128;
  // Fix: grid must cover the kernel's loop bound (dihedral_14_numbers, which
  // may exceed atom_numbers), not the atom count.
  size_t block_per_grid = ceilf(static_cast<float>(dihedral_14_numbers) / 128);
  UNSIGNED_INT_VECTOR *uint_crd =
    const_cast<UNSIGNED_INT_VECTOR *>(reinterpret_cast<const UNSIGNED_INT_VECTOR *>(uint_crd_f));
  UINT_VECTOR_LJ_TYPE *uint_crd_with_LJ = reinterpret_cast<UINT_VECTOR_LJ_TYPE *>(uint_crd_with_LJ_f);
  Copy_Crd_To_New_Crd_Start<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(
    atom_numbers, uint_crd, uint_crd_with_LJ, LJtype, charge);
  Reset_List<<<ceilf(static_cast<float>(3. * atom_numbers) / 128), 128, 0, stream>>>(3 * atom_numbers, frc_f, 0.);
  // Fix: atom_energy was previously zeroed twice while atom_virial was never
  // zeroed, so stale virial values were accumulated into. Grid sizes now match
  // the atom_numbers element counts instead of 3 * atom_numbers.
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_energy, 0.);
  Reset_List<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, atom_virial, 0.);
  VECTOR *boxlength = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(boxlength_f));
  VECTOR *frc = const_cast<VECTOR *>(reinterpret_cast<const VECTOR *>(frc_f));
  Dihedral14LJCFForceWithAtomEnergyAndVirialKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
    dihedral_14_numbers, uint_crd_with_LJ, boxlength, a_14, b_14, lj_scale_factor, cf_scale_factor, LJ_type_A,
    LJ_type_B, frc, atom_energy, atom_virial);
  return;
}
| void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers, | |||
| const int *uint_crd_f, const int *LJtype, const float *charge, | |||
| float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, | |||
| const int *b_14, const float *lj_scale_factor, | |||
| const float *cf_scale_factor, const float *LJ_type_A, | |||
| const float *LJ_type_B, float *frc_f, float *atom_energy, | |||
| float *atom_virial, cudaStream_t stream); | |||
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H | |||
| #include <curand_kernel.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| void Dihedral14LJCFForceWithAtomEnergyAndVirial(const int dihedral_14_numbers, const int atom_numbers, | |||
| const int *uint_crd_f, const int *LJtype, const float *charge, | |||
| float *uint_crd_with_LJ_f, const float *boxlength_f, const int *a_14, | |||
| const int *b_14, const float *lj_scale_factor, | |||
| const float *cf_scale_factor, const float *LJ_type_A, | |||
| const float *LJ_type_B, float *frc_f, float *atom_energy, | |||
| float *atom_virial, cudaStream_t stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NB14_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_IMPL_H | |||
| @@ -13,8 +13,31 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh" | |||
| #include <stdio.h> | |||
| #include <vector> | |||
// Unary minus for VECTOR: component-wise negation.
__device__ __host__ VECTOR operator-(const VECTOR &vecb) {
  VECTOR vec;
  vec.x = -vecb.x;
  vec.y = -vecb.y;
  vec.z = -vecb.z;
  return vec;
}
// Minimum-image displacement vec_a - vec_b: each component is wrapped by the
// corresponding box length into [-box/2, box/2).
__device__ __host__ VECTOR Get_Periodic_Displacement(const VECTOR vec_a, const VECTOR vec_b, const VECTOR box_length) {
  VECTOR dr;
  dr.x = vec_a.x - vec_b.x;
  dr.y = vec_a.y - vec_b.y;
  dr.z = vec_a.z - vec_b.z;  // fix: previously the z difference was stored into dr.x, corrupting both x and z
  dr.x = dr.x - floorf(dr.x / box_length.x + 0.5f) * box_length.x;
  dr.y = dr.y - floorf(dr.y / box_length.y + 0.5f) * box_length.y;
  dr.z = dr.z - floorf(dr.z / box_length.z + 0.5f) * box_length.z;
  return dr;
}
| __global__ void Copy_List(const int element_numbers, const int *origin_list, int *list) { | |||
| int i = blockDim.x * blockIdx.x + threadIdx.x; | |||
| if (i < element_numbers) { | |||
| @@ -32,15 +55,20 @@ __global__ void Crd_To_Uint_Crd(const int atom_numbers, float *scale_factor, con | |||
| UNSIGNED_INT_VECTOR *uint_crd) { | |||
| int atom_i = blockDim.x * blockIdx.x + threadIdx.x; | |||
| if (atom_i < atom_numbers) { | |||
| uint_crd[atom_i].uint_x = crd[atom_i].x * scale_factor[0]; | |||
| uint_crd[atom_i].uint_y = crd[atom_i].y * scale_factor[1]; | |||
| uint_crd[atom_i].uint_z = crd[atom_i].z * scale_factor[2]; | |||
| /*uint_crd[atom_i].uint_x = 2 * uint_crd[atom_i].uint_x; | |||
| uint_crd[atom_i].uint_y = 2 * uint_crd[atom_i].uint_y; | |||
| uint_crd[atom_i].uint_z = 2 * uint_crd[atom_i].uint_z;*/ | |||
| uint_crd[atom_i].uint_x = uint_crd[atom_i].uint_x << 1; | |||
| uint_crd[atom_i].uint_y = uint_crd[atom_i].uint_y << 1; | |||
| uint_crd[atom_i].uint_z = uint_crd[atom_i].uint_z << 1; | |||
| INT_VECTOR tempi; | |||
| VECTOR temp = crd[atom_i]; | |||
| temp.x *= scale_factor[0]; | |||
| temp.y *= scale_factor[1]; | |||
| temp.z *= scale_factor[2]; | |||
| tempi.int_x = temp.x; | |||
| tempi.int_y = temp.y; | |||
| tempi.int_z = temp.z; | |||
| uint_crd[atom_i].uint_x = (tempi.int_x << 2); | |||
| uint_crd[atom_i].uint_y = (tempi.int_y << 2); | |||
| uint_crd[atom_i].uint_z = (tempi.int_z << 2); | |||
| } | |||
| } | |||
| @@ -80,7 +108,6 @@ __global__ void Crd_Periodic_Map(const int atom_numbers, VECTOR *crd, const floa | |||
| } else { | |||
| crd[atom_i].y = crd[atom_i].y + box_length[1]; | |||
| } | |||
| if (crd[atom_i].z >= 0) { | |||
| if (crd[atom_i].z < box_length[2]) { | |||
| } else { | |||
| @@ -198,6 +225,21 @@ __global__ void Is_need_refresh_neighbor_list_cuda(const int atom_numbers, const | |||
| } | |||
| } | |||
// One thread per atom: computes the periodic minimum-image displacement of
// each atom since the last neighbor-list build; if any squared displacement
// exceeds half_skin_square, raises the shared refresh flag.
__global__ void Is_need_refresh_neighbor_list_cuda(const int atom_numbers, const VECTOR *crd, const VECTOR *old_crd,
                                                   const VECTOR *box_length, const float half_skin_square,
                                                   int *need_refresh_flag) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < atom_numbers) {
    VECTOR r1 = crd[i];
    VECTOR r2 = old_crd[i];
    r1 = Get_Periodic_Displacement(r1, r2, box_length[0]);
    float r1_2 = r1.x * r1.x + r1.y * r1.y + r1.z * r1.z;
    if (r1_2 > half_skin_square) {
      // atomicExch: many threads may set the flag concurrently; any one wins.
      atomicExch(&need_refresh_flag[0], 1);
    }
  }
}
| __global__ void Delete_Excluded_Atoms_Serial_In_Neighbor_List(const int atom_numbers, NEIGHBOR_LIST *nl, | |||
| const int *excluded_list_start, const int *excluded_list, | |||
| const int *excluded_atom_numbers) { | |||
| @@ -245,27 +287,18 @@ void Refresh_Neighbor_List(int *refresh_sign, const int thread, const int atom_n | |||
| int *excluded_list_start, int *excluded_list, int *excluded_numbers, | |||
| float cutoff_skin_square, int grid_numbers, float *grid_length_inverse, int *grid_N, int Nxy, | |||
| cudaStream_t stream) { | |||
| if (refresh_sign[0] == 1) { | |||
| VECTOR trans_vec = {-skin, -skin, -skin}; | |||
| std::vector<int> h_refresh_sign(1); | |||
| cudaMemcpyAsync(h_refresh_sign.data(), refresh_sign, sizeof(int), cudaMemcpyDeviceToHost, stream); | |||
| if (h_refresh_sign[0] == 1) { | |||
| Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / thread), thread, 0, stream>>>( | |||
| grid_numbers, atom_numbers_in_grid_bucket, bucket); | |||
| Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd, | |||
| trans_vec); | |||
| Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd, | |||
| box_length); | |||
| Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>( | |||
| atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial); | |||
| trans_vec.x = -trans_vec.x; | |||
| trans_vec.y = -trans_vec.y; | |||
| trans_vec.z = -trans_vec.z; | |||
| Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, stream>>>(atom_numbers, crd, | |||
| trans_vec); | |||
| Copy_List<<<ceilf(static_cast<float>(3. * atom_numbers) / thread), thread, 0, stream>>>( | |||
| 3 * atom_numbers, reinterpret_cast<float *>(crd), reinterpret_cast<float *>(old_crd)); | |||
| @@ -282,40 +315,10 @@ void Refresh_Neighbor_List(int *refresh_sign, const int thread, const int atom_n | |||
| Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<ceilf(static_cast<float>(atom_numbers) / thread), thread, 0, | |||
| stream>>>(atom_numbers, d_nl, excluded_list_start, excluded_list, | |||
| excluded_numbers); | |||
| refresh_sign[0] = 0; | |||
| h_refresh_sign[0] = 0; | |||
| } | |||
| } | |||
// Builds the neighbor list from scratch: clears the grid buckets, maps the
// coordinates into the periodic box, bins atoms into grid cells, translates
// coordinates by +skin, snapshots crd into old_crd, converts to unsigned-int
// coordinates and finally collects neighbors and removes excluded pairs.
// All kernels are enqueued on `stream`; the setup kernels use a fixed block
// width of 32 while the two neighbor kernels use the caller-supplied `thread`.
void Refresh_Neighbor_List_First_Time(int *refresh_sign, const int thread, const int atom_numbers, VECTOR *crd,
                                      VECTOR *old_crd, UNSIGNED_INT_VECTOR *uint_crd, float *crd_to_uint_crd_cof,
                                      float *uint_dr_to_dr_cof, int *atom_in_grid_serial, const float skin,
                                      float *box_length, const GRID_POINTER *gpointer, GRID_BUCKET *bucket,
                                      int *atom_numbers_in_grid_bucket, NEIGHBOR_LIST *d_nl, int *excluded_list_start,
                                      int *excluded_list, int *excluded_numbers, float cutoff_skin_square,
                                      int grid_numbers, float *grid_length_inverse, int *grid_N, int Nxy,
                                      cudaStream_t stream) {
  // Hoist the repeated ceil-div launch-dimension computations.
  const int grid_blocks_32 = ceilf(static_cast<float>(grid_numbers) / 32);
  const int atom_blocks_32 = ceilf(static_cast<float>(atom_numbers) / 32);
  const int crd_blocks_32 = ceilf(static_cast<float>(3. * atom_numbers) / 32);
  const int atom_blocks = ceilf(static_cast<float>(atom_numbers) / thread);
  const VECTOR skin_shift = {skin, skin, skin};
  Clear_Grid_Bucket<<<grid_blocks_32, 32, 0, stream>>>(grid_numbers, atom_numbers_in_grid_bucket, bucket);
  Crd_Periodic_Map<<<atom_blocks_32, 32, 0, stream>>>(atom_numbers, crd, box_length);
  Find_Atom_In_Grid_Serial<<<atom_blocks_32, 32, 0, stream>>>(atom_numbers, grid_length_inverse, crd, grid_N, Nxy,
                                                              atom_in_grid_serial);
  Vector_Translation<<<atom_blocks_32, 32, 0, stream>>>(atom_numbers, crd, skin_shift);
  // Snapshot the (translated) coordinates so later drift checks compare
  // against this rebuild's positions.
  Copy_List<<<crd_blocks_32, 32, 0, stream>>>(3 * atom_numbers, reinterpret_cast<float *>(crd),
                                              reinterpret_cast<float *>(old_crd));
  Put_Atom_In_Grid_Bucket<<<atom_blocks_32, 32, 0, stream>>>(atom_numbers, atom_in_grid_serial, bucket,
                                                             atom_numbers_in_grid_bucket);
  Crd_To_Uint_Crd<<<atom_blocks_32, 32, 0, stream>>>(atom_numbers, crd_to_uint_crd_cof, crd, uint_crd);
  Find_atom_neighbors<<<atom_blocks, thread, 0, stream>>>(atom_numbers, uint_crd, uint_dr_to_dr_cof,
                                                          atom_in_grid_serial, gpointer, bucket,
                                                          atom_numbers_in_grid_bucket, d_nl, cutoff_skin_square);
  Delete_Excluded_Atoms_Serial_In_Neighbor_List<<<atom_blocks, thread, 0, stream>>>(
    atom_numbers, d_nl, excluded_list_start, excluded_list, excluded_numbers);
}
| __global__ void construct_neighbor_list_kernel(int atom_numbers, int max_neighbor_numbers, int *nl_atom_numbers, | |||
| int *nl_atom_serial, NEIGHBOR_LIST *nl) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) { | |||
| @@ -330,15 +333,39 @@ void Construct_Neighbor_List(int atom_numbers, int max_neighbor_numbers, int *nl | |||
| atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl); | |||
| } | |||
| __global__ void copy_neighbor_list_atom_number(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) { | |||
// Flattens the NEIGHBOR_LIST structures into two plain device arrays:
// nl_atom_numbers[i] receives the neighbor count of atom i, and
// nl_atom_serial[i * max_neighbor_numbers + j] receives its j-th neighbor
// serial (zero-padded past the actual count). Grid-stride in both x (atoms)
// and y (neighbor slots), so any launch configuration covers the data.
__global__ void copy_neighbor_list_atom_number(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl,
                                               int *nl_atom_numbers, int *nl_atom_serial) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < atom_numbers; i += gridDim.x * blockDim.x) {
    const int neighbor_count = nl[i].atom_numbers;
    nl_atom_numbers[i] = neighbor_count;
    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < max_neighbor_numbers; j += gridDim.y * blockDim.y) {
      nl_atom_serial[i * max_neighbor_numbers + j] = (j < neighbor_count) ? nl[i].atom_serial[j] : 0;
    }
  }
}
| void CopyNeighborListAtomNumber(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, cudaStream_t stream) { | |||
| copy_neighbor_list_atom_number<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(atom_numbers, nl, | |||
| nl_atom_numbers); | |||
// Fills the first element_numbers entries of an int device array with
// replace_element. One thread per element; out-of-range threads exit early.
__global__ void Reset_List(const int element_numbers, int *list, const int replace_element) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= element_numbers) {
    return;
  }
  list[idx] = replace_element;
}
// Float overload: fills the first element_numbers entries of a float device
// array with replace_element. One thread per element; guard-return on overflow.
__global__ void Reset_List(const int element_numbers, float *list, const float replace_element) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= element_numbers) {
    return;
  }
  list[idx] = replace_element;
}
// Host wrapper: launches copy_neighbor_list_atom_number on `stream` with 128
// threads per block (1D grid; the kernel's y-dimension grid-stride loop then
// iterates neighbor slots sequentially within each thread).
void CopyNeighborListAtomNumber(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers,
                                int *nl_atom_serial, cudaStream_t stream) {
  const int block_count = ceilf(static_cast<float>(atom_numbers) / 128);
  copy_neighbor_list_atom_number<<<block_count, 128, 0, stream>>>(atom_numbers, max_neighbor_numbers, nl,
                                                                  nl_atom_numbers, nl_atom_serial);
}
| void Refresh_Neighbor_List_No_Check(int grid_numbers, int atom_numbers, float skin, int Nxy, float cutoff_skin_square, | |||
| @@ -348,22 +375,13 @@ void Refresh_Neighbor_List_No_Check(int grid_numbers, int atom_numbers, float sk | |||
| UNSIGNED_INT_VECTOR *uint_crd, float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, | |||
| NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list, | |||
| int *excluded_numbers, cudaStream_t stream) { | |||
| VECTOR trans_vec = {-skin, -skin, -skin}; | |||
| Clear_Grid_Bucket<<<ceilf(static_cast<float>(grid_numbers) / 32), 32, 0, stream>>>( | |||
| grid_numbers, atom_numbers_in_grid_bucket, bucket); | |||
| Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, trans_vec); | |||
| Crd_Periodic_Map<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, box_length); | |||
| Find_Atom_In_Grid_Serial<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>( | |||
| atom_numbers, grid_length_inverse, crd, grid_N, Nxy, atom_in_grid_serial); | |||
| trans_vec.x = -trans_vec.x; | |||
| trans_vec.y = -trans_vec.y; | |||
| trans_vec.z = -trans_vec.z; | |||
| Vector_Translation<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>(atom_numbers, crd, trans_vec); | |||
| cudaMemcpyAsync(old_crd, crd, sizeof(VECTOR) * atom_numbers, cudaMemcpyDeviceToDevice, stream); | |||
| Put_Atom_In_Grid_Bucket<<<ceilf(static_cast<float>(atom_numbers) / 32), 32, 0, stream>>>( | |||
| @@ -387,51 +405,53 @@ __global__ void Mul_half(float *src, float *dst) { | |||
| } | |||
| } | |||
| void Neighbor_List_Update(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, | |||
| int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square, | |||
| int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse, | |||
| int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd, | |||
| float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, | |||
| float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, | |||
| int *excluded_list_start, int *excluded_list, int *excluded_numbers, float half_skin_square, | |||
| int *is_need_refresh_neighbor_list, cudaStream_t stream) { | |||
| if (not_first_time) { | |||
| if (refresh_interval > 0) { | |||
| std::vector<int> refresh_count_list(1); | |||
| cudaMemcpyAsync(refresh_count_list.data(), d_refresh_count, sizeof(int), cudaMemcpyDeviceToHost, stream); | |||
| cudaStreamSynchronize(stream); | |||
| int refresh_count = refresh_count_list[0]; | |||
| if (refresh_count % refresh_interval == 0) { | |||
| Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof); | |||
| Refresh_Neighbor_List_No_Check(grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length, | |||
| atom_numbers_in_grid_bucket, grid_length_inverse, atom_in_grid_serial, bucket, | |||
| reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd), | |||
| half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd), | |||
| uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list, | |||
| excluded_numbers, stream); | |||
| } | |||
| refresh_count += 1; | |||
| cudaMemcpyAsync(d_refresh_count, &refresh_count, sizeof(int), cudaMemcpyHostToDevice, stream); | |||
| } else { | |||
| Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>( | |||
| atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd), half_skin_square, | |||
| is_need_refresh_neighbor_list); | |||
| Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof); | |||
| Refresh_Neighbor_List(is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd), | |||
| reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd), | |||
| half_crd_to_uint_crd_cof, uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length, | |||
| gpointer, bucket, atom_numbers_in_grid_bucket, d_nl, excluded_list_start, excluded_list, | |||
| excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse, grid_N, Nxy, | |||
| stream); | |||
// Writes dst[i] = src[i] / 4 for the first three components (launched as
// <<<1, 3>>>: one thread per component of the coordinate-scaling vector).
__global__ void Mul_quarter(float *src, float *dst) {
  int index = threadIdx.x;
  if (index < 3) {
    // 0.25f keeps the multiply in single precision; the bare double literal
    // 0.25 forced a float->double->float round trip on every thread. The
    // result is bit-identical since 0.25 is exactly representable.
    dst[index] = src[index] * 0.25f;
  }
}
// Host-side step counter consumed by Neighbor_List_Update_New below.
// NOTE(review): file-scope mutable state — shared by every caller in the
// process and not thread-safe; confirm only one neighbor-list updater
// instance uses this translation unit at a time.
int refresh_count = 0;
// Top-level neighbor-list maintenance entry point. Three modes:
//   forced_update      -> unconditionally rebuild (no drift check);
//   refresh_interval>0 -> rebuild every refresh_interval-th call, counted by
//                         the file-scope `refresh_count` (d_refresh_count and
//                         not_first_time are accepted but unused here);
//   otherwise          -> drift-checked refresh path.
// Mul_quarter / Mul_half rescale crd_to_uint_crd_cof into
// half_crd_to_uint_crd_cof before each rebuild. All work is enqueued on
// `stream`.
void Neighbor_List_Update_New(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval,
                              int not_first_time, float skin, int Nxy, float cutoff_square,
                              float cutoff_with_skin_square, int *grid_N, float *box_length,
                              int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial,
                              GRID_BUCKET *bucket, float *crd, float *old_crd, float *crd_to_uint_crd_cof,
                              float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, float *uint_dr_to_dr_cof,
                              GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list,
                              int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list,
                              int forced_update, int forced_check, cudaStream_t stream) {
  if (forced_update) {
    // Forced rebuild: rescale the conversion coefficients, then rebuild
    // unconditionally.
    Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
    Refresh_Neighbor_List_No_Check(
      grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length, atom_numbers_in_grid_bucket,
      grid_length_inverse, atom_in_grid_serial, bucket, reinterpret_cast<VECTOR *>(crd),
      reinterpret_cast<VECTOR *>(old_crd), half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
      uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list, excluded_numbers, stream);
  } else if (refresh_interval > 0 && !forced_check) {
    // Interval-driven rebuild, gated on the host-side global counter.
    if (refresh_count % refresh_interval == 0) {
      Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
      Refresh_Neighbor_List_No_Check(grid_numbers, atom_numbers, skin, Nxy, cutoff_square, grid_N, box_length,
                                     atom_numbers_in_grid_bucket, grid_length_inverse, atom_in_grid_serial, bucket,
                                     reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
                                     half_crd_to_uint_crd_cof, reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
                                     uint_dr_to_dr_cof, gpointer, d_nl, excluded_list_start, excluded_list,
                                     excluded_numbers, stream);
    }
    refresh_count += 1;
  } else {
    // Drift-checked path.
    // NOTE(review): this branch performs a full first-time build (Mul_half +
    // Refresh_Neighbor_List_First_Time) AND then a drift-checked refresh
    // (Is_need_refresh... + Mul_quarter + Refresh_Neighbor_List). Doing both
    // on every call looks redundant — confirm the double rebuild is intended.
    Mul_half<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
    Refresh_Neighbor_List_First_Time(
      is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd),
      reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd), half_crd_to_uint_crd_cof,
      uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length, gpointer, bucket, atom_numbers_in_grid_bucket, d_nl,
      excluded_list_start, excluded_list, excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse,
      grid_N, Nxy, stream);
    // Set the device flag if any atom drifted past half the skin distance.
    Is_need_refresh_neighbor_list_cuda<<<ceilf(static_cast<float>(atom_numbers) / 128), 128, 0, stream>>>(
      atom_numbers, reinterpret_cast<VECTOR *>(crd), reinterpret_cast<VECTOR *>(old_crd),
      reinterpret_cast<VECTOR *>(box_length), half_skin_square, is_need_refresh_neighbor_list);
    Mul_quarter<<<1, 3, 0, stream>>>(crd_to_uint_crd_cof, half_crd_to_uint_crd_cof);
    // Refresh_Neighbor_List reads the flag back and rebuilds only if set.
    Refresh_Neighbor_List(is_need_refresh_neighbor_list, 32, atom_numbers, reinterpret_cast<VECTOR *>(crd),
                          reinterpret_cast<VECTOR *>(old_crd), reinterpret_cast<UNSIGNED_INT_VECTOR *>(uint_crd),
                          half_crd_to_uint_crd_cof, uint_dr_to_dr_cof, atom_in_grid_serial, skin, box_length, gpointer,
                          bucket, atom_numbers_in_grid_bucket, d_nl, excluded_list_start, excluded_list,
                          excluded_numbers, cutoff_with_skin_square, grid_numbers, grid_length_inverse, grid_N, Nxy,
                          stream);
  }
}
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_IMPL_H_ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_NEW_IMPL_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPONGE_NEIGHBOR_LIST_NEW_IMPL_H_ | |||
| struct VECTOR { | |||
| float x; | |||
| @@ -46,15 +46,17 @@ struct GRID_POINTER { | |||
| void Construct_Neighbor_List(int grid_numbers, int max_neighbor_numbers, int *nl_atom_numbers, int *nl_atom_serial, | |||
| NEIGHBOR_LIST *nl, cudaStream_t stream); | |||
| void CopyNeighborListAtomNumber(int atom_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, cudaStream_t stream); | |||
| void CopyNeighborListAtomNumber(int atom_numbers, int max_neighbor_numbers, NEIGHBOR_LIST *nl, int *nl_atom_numbers, | |||
| int *nl_atom_serial, cudaStream_t stream); | |||
| void Neighbor_List_Update(int grid_numbers, int atom_numbers, int* d_refresh_count, int refresh_interval, | |||
| int not_first_time, float skin, int Nxy, float cutoff_square, float cutoff_with_skin_square, | |||
| int *grid_N, float *box_length, int *atom_numbers_in_grid_bucket, float *grid_length_inverse, | |||
| int *atom_in_grid_serial, GRID_BUCKET *bucket, float *crd, float *old_crd, | |||
| float *crd_to_uint_crd_cof, float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, | |||
| float *uint_dr_to_dr_cof, GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, | |||
| int *excluded_list_start, int *excluded_list, int *excluded_numbers, float half_skin_square, | |||
| int *is_need_refresh_neighbor_list, cudaStream_t stream); | |||
| void Neighbor_List_Update_New(int grid_numbers, int atom_numbers, int *d_refresh_count, int refresh_interval, | |||
| int not_first_time, float skin, int Nxy, float cutoff_square, | |||
| float cutoff_with_skin_square, int *grid_N, float *box_length, | |||
| int *atom_numbers_in_grid_bucket, float *grid_length_inverse, int *atom_in_grid_serial, | |||
| GRID_BUCKET *bucket, float *crd, float *old_crd, float *crd_to_uint_crd_cof, | |||
| float *half_crd_to_uint_crd_cof, unsigned int *uint_crd, float *uint_dr_to_dr_cof, | |||
| GRID_POINTER *gpointer, NEIGHBOR_LIST *d_nl, int *excluded_list_start, int *excluded_list, | |||
| int *excluded_numbers, float half_skin_square, int *is_need_refresh_neighbor_list, | |||
| int forced_update, int forced_check, cudaStream_t stream); | |||
| #endif | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/sponge/bond/bond_force_with_atom_energy_and_virial_kernel.h" | |||
namespace mindspore {
namespace kernel {
// Registers BondForceWithAtomEnergyAndVirial with the GPU kernel factory.
// Inputs: uint_crd (uint32), scaler (float32), atom_a/atom_b (int32),
// bond_k/bond_r0 (float32). Outputs: frc, atom_energy, atom_virial (float32).
// Template bindings: T=float, T1=int, T2=unsigned int.
MS_REG_GPU_KERNEL_THREE(BondForceWithAtomEnergyAndVirial,
                        KernelAttr()
                          .AddInputAttr(kNumberTypeUInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeInt32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddInputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32)
                          .AddOutputAttr(kNumberTypeFloat32),
                        BondForceWithAtomEnergyAndVirialGpuKernel, float, int, unsigned int)
}  // namespace kernel
}  // namespace mindspore
| @@ -0,0 +1,112 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_BOND_BOND_FORCE_WITH_ATOM_ENERGY_AND_VIRIAL_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_BOND_BOND_FORCE_WITH_ATOM_ENERGY_AND_VIRIAL_KERNEL_H_ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/bond/bond_force_with_atom_energy_and_virial_impl.cuh" | |||
| #include <cuda_runtime_api.h> | |||
| #include <map> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename T1, typename T2> | |||
| class BondForceWithAtomEnergyAndVirialGpuKernel : public GpuKernel { | |||
| public: | |||
| BondForceWithAtomEnergyAndVirialGpuKernel() : ele_uint_crd(1) {} | |||
| ~BondForceWithAtomEnergyAndVirialGpuKernel() override = default; | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| // get bond_numbers | |||
| kernel_node_ = kernel_node; | |||
| bond_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "bond_numbers")); | |||
| atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers")); | |||
| auto shape_uint_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto shape_scaler = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto shape_atom_a = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| auto shape_atom_b = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| auto shape_bond_k = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); | |||
| auto shape_bond_r0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5); | |||
| for (size_t i = 0; i < shape_uint_crd.size(); i++) ele_uint_crd *= shape_uint_crd[i]; | |||
| for (size_t i = 0; i < shape_scaler.size(); i++) ele_scaler *= shape_scaler[i]; | |||
| for (size_t i = 0; i < shape_atom_a.size(); i++) ele_atom_a *= shape_atom_a[i]; | |||
| for (size_t i = 0; i < shape_atom_b.size(); i++) ele_atom_b *= shape_atom_b[i]; | |||
| for (size_t i = 0; i < shape_bond_k.size(); i++) ele_bond_k *= shape_bond_k[i]; | |||
| for (size_t i = 0; i < shape_bond_r0.size(); i++) ele_bond_r0 *= shape_bond_r0[i]; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| auto uint_crd_f = GetDeviceAddress<const T2>(inputs, 0); | |||
| auto scaler_f = GetDeviceAddress<T>(inputs, 1); | |||
| auto atom_a = GetDeviceAddress<const T1>(inputs, 2); | |||
| auto atom_b = GetDeviceAddress<const T1>(inputs, 3); | |||
| auto bond_k = GetDeviceAddress<T>(inputs, 4); | |||
| auto bond_r0 = GetDeviceAddress<T>(inputs, 5); | |||
| auto frc_f = GetDeviceAddress<T>(outputs, 0); | |||
| auto atom_energy = GetDeviceAddress<T>(outputs, 1); | |||
| auto atom_v = GetDeviceAddress<T>(outputs, 2); | |||
| BondForceWithAtomEnergyAndVirial(bond_numbers, atom_numbers, uint_crd_f, scaler_f, atom_a, atom_b, bond_k, bond_r0, | |||
| frc_f, atom_energy, atom_v, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(ele_uint_crd * sizeof(T2)); | |||
| input_size_list_.push_back(ele_scaler * sizeof(T)); | |||
| input_size_list_.push_back(ele_atom_a * sizeof(T1)); | |||
| input_size_list_.push_back(ele_atom_b * sizeof(T1)); | |||
| input_size_list_.push_back(ele_bond_k * sizeof(T)); | |||
| input_size_list_.push_back(ele_bond_r0 * sizeof(T)); | |||
| output_size_list_.push_back(atom_numbers * 3 * sizeof(T)); | |||
| output_size_list_.push_back(atom_numbers * sizeof(T)); | |||
| output_size_list_.push_back(atom_numbers * sizeof(T)); | |||
| } | |||
| private: | |||
| size_t ele_uint_crd = 1; | |||
| size_t ele_scaler = 1; | |||
| size_t ele_atom_a = 1; | |||
| size_t ele_atom_b = 1; | |||
| size_t ele_bond_k = 1; | |||
| size_t ele_bond_r0 = 1; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| int bond_numbers; | |||
| int atom_numbers; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif | |||
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/sponge/common/get_center_of_mass_kernel.h" | |||
namespace mindspore {
namespace kernel {
// Registers GetCenterOfMass with the GPU kernel factory.
// Inputs: start/end (int32 residue ranges), crd, atom_mass,
// residue_mass_inverse (float32). Output: center_of_mass (float32).
// Template bindings: T=float, T1=int.
MS_REG_GPU_KERNEL_TWO(GetCenterOfMass,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      GetCenterOfMassGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
| @@ -0,0 +1,101 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GET_CENTER_OF_MASS_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GET_CENTER_OF_MASS_KERNEL_H_ | |||
| #include <cuda_runtime_api.h> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/get_center_of_mass_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename T1> | |||
| class GetCenterOfMassGpuKernel : public GpuKernel { | |||
| public: | |||
| GetCenterOfMassGpuKernel() : ele_start(1) {} | |||
| ~GetCenterOfMassGpuKernel() override = default; | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| kernel_node_ = kernel_node; | |||
| residue_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "residue_numbers")); | |||
| auto shape_start = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto shape_end = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| auto shape_atom_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| auto shape_residue_mass_inverse = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); | |||
| for (size_t i = 0; i < shape_start.size(); i++) ele_start *= shape_start[i]; | |||
| for (size_t i = 0; i < shape_end.size(); i++) ele_end *= shape_end[i]; | |||
| for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i]; | |||
| for (size_t i = 0; i < shape_atom_mass.size(); i++) ele_atom_mass *= shape_atom_mass[i]; | |||
| for (size_t i = 0; i < shape_residue_mass_inverse.size(); i++) | |||
| ele_residue_mass_inverse *= shape_residue_mass_inverse[i]; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| auto start = GetDeviceAddress<T1>(inputs, 0); | |||
| auto end = GetDeviceAddress<T1>(inputs, 1); | |||
| auto crd = GetDeviceAddress<T>(inputs, 2); | |||
| auto atom_mass = GetDeviceAddress<T>(inputs, 3); | |||
| auto residue_mass_inverse = GetDeviceAddress<T>(inputs, 4); | |||
| auto center_of_mass = GetDeviceAddress<T>(outputs, 0); | |||
| GetCenterOfMass(residue_numbers, start, end, crd, atom_mass, residue_mass_inverse, center_of_mass, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(ele_start * sizeof(T1)); | |||
| input_size_list_.push_back(ele_end * sizeof(T1)); | |||
| input_size_list_.push_back(ele_crd * sizeof(T)); | |||
| input_size_list_.push_back(ele_atom_mass * sizeof(T)); | |||
| input_size_list_.push_back(ele_residue_mass_inverse * sizeof(T)); | |||
| output_size_list_.push_back(3 * sizeof(T) * residue_numbers); | |||
| } | |||
| private: | |||
| size_t ele_start = 1; | |||
| size_t ele_end = 1; | |||
| size_t ele_crd = 1; | |||
| size_t ele_atom_mass = 1; | |||
| size_t ele_residue_mass_inverse = 1; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| int residue_numbers; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_GET_CENTER_OF_MASS_KERNEL_H_ | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/sponge/common/map_center_of_mass_kernel.h" | |||
namespace mindspore {
namespace kernel {
// Registers MapCenterOfMass with the GPU kernel factory.
// Inputs: start/end (int32 residue ranges), then four float32 tensors
// (per the header this kernel also reads a "scaler" attribute).
// Output: float32. Template bindings: T=float, T1=int.
MS_REG_GPU_KERNEL_TWO(MapCenterOfMass,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      MapCenterOfMassGpuKernel, float, int)
}  // namespace kernel
}  // namespace mindspore
| @@ -0,0 +1,105 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MAP_CENTER_OF_MASS_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MAP_CENTER_OF_MASS_KERNEL_H_ | |||
| #include <cuda_runtime_api.h> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/common/map_center_of_mass_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
template <typename T, typename T1>
class MapCenterOfMassGpuKernel : public GpuKernel {
 public:
  MapCenterOfMassGpuKernel() : ele_start(1) {}
  ~MapCenterOfMassGpuKernel() override = default;

  // Reads kernel attributes and the six input shapes, accumulates each input's
  // element count, and fills the input/output size lists.
  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    residue_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "residue_numbers"));
    scaler = static_cast<float>(GetAttr<float>(kernel_node, "scaler"));
    auto shape_start = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape_end = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape_center_of_mass = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    auto shape_box_length = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto shape_no_wrap_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4);
    // Bug fix: crd is input 5 (see Launch below); it was previously fetched from
    // index 4, duplicating no_wrap_crd and sizing the crd buffer from the wrong tensor.
    auto shape_crd = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 5);

    for (size_t i = 0; i < shape_start.size(); i++) ele_start *= shape_start[i];
    for (size_t i = 0; i < shape_end.size(); i++) ele_end *= shape_end[i];
    for (size_t i = 0; i < shape_center_of_mass.size(); i++) ele_center_of_mass *= shape_center_of_mass[i];
    for (size_t i = 0; i < shape_box_length.size(); i++) ele_box_length *= shape_box_length[i];
    for (size_t i = 0; i < shape_no_wrap_crd.size(); i++) ele_no_wrap_crd *= shape_no_wrap_crd[i];
    for (size_t i = 0; i < shape_crd.size(); i++) ele_crd *= shape_crd[i];

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  // Launches the MapCenterOfMass CUDA implementation on the given stream.
  // crd (input 5) is updated in place by the device code; the declared output
  // is only a placeholder.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto start = GetDeviceAddress<T1>(inputs, 0);
    auto end = GetDeviceAddress<T1>(inputs, 1);
    auto center_of_mass = GetDeviceAddress<T>(inputs, 2);
    auto box_length = GetDeviceAddress<T>(inputs, 3);
    auto no_wrap_crd = GetDeviceAddress<T>(inputs, 4);
    auto crd = GetDeviceAddress<T>(inputs, 5);
    MapCenterOfMass(residue_numbers, start, end, scaler, center_of_mass, box_length, no_wrap_crd, crd,
                    reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_start * sizeof(T1));
    input_size_list_.push_back(ele_end * sizeof(T1));
    input_size_list_.push_back(ele_center_of_mass * sizeof(T));
    input_size_list_.push_back(ele_box_length * sizeof(T));
    input_size_list_.push_back(ele_no_wrap_crd * sizeof(T));
    input_size_list_.push_back(ele_crd * sizeof(T));
    // Placeholder output: a single T (results land in crd in place).
    output_size_list_.push_back(sizeof(T));
  }

 private:
  // Element counts (products of the corresponding input shapes).
  size_t ele_start = 1;
  size_t ele_end = 1;
  size_t ele_center_of_mass = 1;
  size_t ele_box_length = 1;
  size_t ele_no_wrap_crd = 1;
  size_t ele_crd = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int residue_numbers;   // number of residues, from the "residue_numbers" attr
  float scaler;          // wrapping scale factor, from the "scaler" attr
};
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_COMMON_MAP_CENTER_OF_MASS_KERNEL_H_ | |||
| @@ -0,0 +1,39 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Register the Dihedral14LJCFForceWithAtomEnergyAndVirial GPU kernel.
// Inputs (in order): uint_crd_f (uint32), LJ_type (int32), charge (float32),
// boxlength_f (float32), a_14 (int32), b_14 (int32), lj_scale_factor (float32),
// cf_scale_factor (float32), LJ_type_A (float32), LJ_type_B (float32).
// Outputs: frc_f, atom_energy, atom_virial (all float32).
// Template binding: T = float, T1 = int.
MS_REG_GPU_KERNEL_TWO(Dihedral14LJCFForceWithAtomEnergyAndVirial,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeUInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeInt32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      Dihedral14LJCFForceWithAtomEnergyAndVirialGpuKernel, float, int)
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,145 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_AND_VIRIAL_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONG_DIHEDRAL_14_LJ_CF_FORCE_WITH_ATOM_ENERGY_AND_VIRIAL_KERNEL_H_ | |||
| #include <cuda_runtime_api.h> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/nb14/dihedral_14_lj_cf_force_with_atom_energy_and_virial_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
template <typename T, typename T1>
class Dihedral14LJCFForceWithAtomEnergyAndVirialGpuKernel : public GpuKernel {
 public:
  Dihedral14LJCFForceWithAtomEnergyAndVirialGpuKernel() : ele_uint_crd(1) {}
  ~Dihedral14LJCFForceWithAtomEnergyAndVirialGpuKernel() override = default;

  // Reads the two scalar attributes and the ten input shapes, accumulates each
  // input's element count, and fills the input/workspace/output size lists.
  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    dihedral_14_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "dihedral_14_numbers"));
    atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers"));

    // Product of all dimensions of the tensor feeding input `index`.
    auto element_count = [&kernel_node](size_t index) {
      size_t total = 1;
      for (size_t dim : AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index)) {
        total *= dim;
      }
      return total;
    };
    ele_uint_crd *= element_count(0);
    ele_LJtype *= element_count(1);
    ele_charge *= element_count(2);
    ele_boxlength_f *= element_count(3);
    ele_a_14 *= element_count(4);
    ele_b_14 *= element_count(5);
    ele_lj_scale_factor *= element_count(6);
    ele_cf_scale_factor *= element_count(7);
    ele_LJ_type_A *= element_count(8);
    ele_LJ_type_B *= element_count(9);

    InitSizeLists();
    return true;
  }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  // Launches the CUDA implementation on the given stream. The workspace buffer
  // packs coordinates, LJ type, and charge per atom (UINT_VECTOR_LJ_TYPE).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto uint_crd_f = GetDeviceAddress<const T1>(inputs, 0);
    auto LJtype = GetDeviceAddress<const T1>(inputs, 1);
    auto charge = GetDeviceAddress<const T>(inputs, 2);
    auto boxlength_f = GetDeviceAddress<T>(inputs, 3);
    auto a_14 = GetDeviceAddress<const T1>(inputs, 4);
    auto b_14 = GetDeviceAddress<const T1>(inputs, 5);
    auto lj_scale_factor = GetDeviceAddress<T>(inputs, 6);
    auto cf_scale_factor = GetDeviceAddress<T>(inputs, 7);
    auto LJ_type_A = GetDeviceAddress<T>(inputs, 8);
    auto LJ_type_B = GetDeviceAddress<T>(inputs, 9);

    auto frc_f = GetDeviceAddress<T>(outputs, 0);
    auto atom_energy = GetDeviceAddress<T>(outputs, 1);
    auto atom_virial = GetDeviceAddress<T>(outputs, 2);
    auto uint_crd_with_LJ = GetDeviceAddress<T>(workspace, 0);

    Dihedral14LJCFForceWithAtomEnergyAndVirial(dihedral_14_numbers, atom_numbers, uint_crd_f, LJtype, charge,
                                               uint_crd_with_LJ, boxlength_f, a_14, b_14, lj_scale_factor,
                                               cf_scale_factor, LJ_type_A, LJ_type_B, frc_f, atom_energy, atom_virial,
                                               reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(ele_uint_crd * sizeof(T1));
    input_size_list_.push_back(ele_LJtype * sizeof(T1));
    input_size_list_.push_back(ele_charge * sizeof(T));
    input_size_list_.push_back(ele_boxlength_f * sizeof(T));
    input_size_list_.push_back(ele_a_14 * sizeof(T1));
    input_size_list_.push_back(ele_b_14 * sizeof(T1));
    input_size_list_.push_back(ele_lj_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_cf_scale_factor * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_A * sizeof(T));
    input_size_list_.push_back(ele_LJ_type_B * sizeof(T));

    // One packed record per atom for the coordinate/LJ-type/charge scratch buffer.
    workspace_size_list_.push_back(atom_numbers * sizeof(UINT_VECTOR_LJ_TYPE));

    output_size_list_.push_back(3 * atom_numbers * sizeof(T));  // frc_f: xyz per atom
    output_size_list_.push_back(atom_numbers * sizeof(T));      // atom_energy
    output_size_list_.push_back(atom_numbers * sizeof(T));      // atom_virial
  }

 private:
  // Element counts (products of the corresponding input shapes).
  size_t ele_uint_crd = 1;
  size_t ele_LJtype = 1;
  size_t ele_charge = 1;
  size_t ele_boxlength_f = 1;
  size_t ele_a_14 = 1;
  size_t ele_b_14 = 1;
  size_t ele_lj_scale_factor = 1;
  size_t ele_cf_scale_factor = 1;
  size_t ele_LJ_type_A = 1;
  size_t ele_LJ_type_B = 1;

  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  int dihedral_14_numbers;  // number of 1-4 dihedral interactions, from attr
  int atom_numbers;         // total atom count, from attr

  // Per-atom record layout used by the workspace buffer; must match the
  // device-side definition in the .cuh implementation.
  struct UINT_VECTOR_LJ_TYPE {
    unsigned int uint_x;
    unsigned int uint_y;
    unsigned int uint_z;
    int LJ_type;
    float charge;
  };
};
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif | |||
| @@ -14,11 +14,11 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/sponge/neighbor_list/neighbor_list_update_new_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(NeighborListUpdate, | |||
| MS_REG_GPU_KERNEL_TWO(NeighborListUpdateNew, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| @@ -40,6 +40,6 @@ MS_REG_GPU_KERNEL_TWO(NeighborListUpdate, | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| NeighborListUpdateGpuKernel, int, float) | |||
| NeighborListUpdateNewGpuKernel, int, float) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_NEW_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_SPONGE_NEIGHBOR_LIST_UPDATE_NEW_KERNEL_H_ | |||
| #include <cuda_runtime_api.h> | |||
| #include <vector> | |||
| @@ -24,15 +24,15 @@ | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/sponge/neighbor_list/neighbor_list_new_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename T1> | |||
| class NeighborListUpdateGpuKernel : public GpuKernel { | |||
| class NeighborListUpdateNewGpuKernel : public GpuKernel { | |||
| public: | |||
| NeighborListUpdateGpuKernel() : skin(2.0), cutoff(10.0), max_atom_in_grid_numbers(64), max_neighbor_numbers(800) {} | |||
| ~NeighborListUpdateGpuKernel() override = default; | |||
| NeighborListUpdateNewGpuKernel() : skin(2.0), cutoff(9.0), max_atom_in_grid_numbers(64), max_neighbor_numbers(800) {} | |||
| ~NeighborListUpdateNewGpuKernel() override = default; | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| grid_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "grid_numbers")); | |||
| atom_numbers = static_cast<int>(GetAttr<int64_t>(kernel_node, "atom_numbers")); | |||
| @@ -46,6 +46,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel { | |||
| cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin")); | |||
| half_cutoff_with_skin = static_cast<float>(GetAttr<float>(kernel_node, "half_cutoff_with_skin")); | |||
| cutoff_with_skin_square = static_cast<float>(GetAttr<float>(kernel_node, "cutoff_with_skin_square")); | |||
| forced_update = static_cast<int>(GetAttr<int64_t>(kernel_node, "forced_update")); | |||
| forced_check = static_cast<int>(GetAttr<int64_t>(kernel_node, "forced_check")); | |||
| h_bucket.resize(grid_numbers); | |||
| h_gpointer.resize(grid_numbers); | |||
| InitSizeLists(); | |||
| @@ -99,13 +101,14 @@ class NeighborListUpdateGpuKernel : public GpuKernel { | |||
| Construct_Neighbor_List(atom_numbers, max_neighbor_numbers, nl_atom_numbers, nl_atom_serial, nl, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| Neighbor_List_Update(grid_numbers, atom_numbers, d_refresh_count, refresh_interval, not_first_time, skin, nxy, | |||
| cutoff_square, cutoff_with_skin_square, grid_n, box_length, atom_numbers_in_grid_bucket, | |||
| grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof, | |||
| half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start, | |||
| excluded_list, excluded_numbers, half_skin_square, need_refresh_flag, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| CopyNeighborListAtomNumber(atom_numbers, nl, nl_atom_numbers, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| Neighbor_List_Update_New(grid_numbers, atom_numbers, d_refresh_count, refresh_interval, not_first_time, skin, nxy, | |||
| cutoff_square, cutoff_with_skin_square, grid_n, box_length, atom_numbers_in_grid_bucket, | |||
| grid_length_inverse, atom_in_grid_serial, d_bucket, crd, old_crd, crd_to_uint_crd_cof, | |||
| half_crd_to_uint_crd_cof, uint_crd, uint_dr_to_dr_cof, d_gpointer, nl, excluded_list_start, | |||
| excluded_list, excluded_numbers, half_skin_square, need_refresh_flag, forced_update, | |||
| forced_check, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| CopyNeighborListAtomNumber(atom_numbers, max_neighbor_numbers, nl, nl_atom_numbers, nl_atom_serial, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| @@ -160,6 +163,8 @@ class NeighborListUpdateGpuKernel : public GpuKernel { | |||
| float cutoff_with_skin; | |||
| float half_cutoff_with_skin; | |||
| float cutoff_with_skin_square; | |||
| int forced_update; | |||
| int forced_check; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||