diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/re_construct_json.py b/mindspore/_extends/parallel_compile/tbe_compiler/re_construct_json.py
new file mode 100644
index 0000000000..24323e9842
--- /dev/null
+++ b/mindspore/_extends/parallel_compile/tbe_compiler/re_construct_json.py
@@ -0,0 +1,164 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Reconstruct json."""
+import json
+
+
+def common_op_info(json_file):
+    """
+    Add more detailed info to an op desc
+    :param json_file: origin op desc
+    :return: op desc with detail fields
+    """
+    json_file["L1_addr_offset"] = 0
+    json_file["L1_fusion_type"] = -1
+    json_file["L1_workspace_size"] = -1
+    json_file["addr_type"] = 0
+    json_file["slice_offset"] = []
+    json_file["split_index"] = 0
+    json_file["total_shape"] = []
+    json_file["valid_shape"] = []
+    return json_file
+
+
+def create_input(json_info):
+    """
+    Create input, type is "Data"
+    :param json_info: json file
+    :return: ops list
+    """
+    ops = []
+    if "inputs" in json_info and json_info["inputs"] is not None:
+        ori_inputs = json_info["inputs"]
+        for _, item in enumerate(ori_inputs):
+            op_info = {
+                "name": item[0]["name"],
+                "output_desc": [common_op_info(item[0])],
+                "type": "Data"
+            }
+            ops.append(op_info)
+    return ops
+
+
+def create_inout_desc(ori_json):
+    """
+    Create input or output, insert "data_type" attr and other detail infos
+    :param ori_json: input or output list, the item in list is a dict
+    :return: list
+    """
+    if ori_json is None:
+        return "null"
+    out_list = []
+    for _, item in enumerate(ori_json):
+        item[0]["data_type"] = item[0]["dtype"] if "dtype" in item[0] else 0
+        if "ori_format" in item[0] or "ori_shape" in item[0]:
+            item[0]["L1_addr_offset"] = 0
+            item[0]["L1_fusion_type"] = -1
+            item[0]["L1_workspace_size"] = -1
+            item[0]["addr_type"] = 0
+            item[0]["slice_offset"] = []
+            item[0]["split_index"] = 0
+            item[0]["total_shape"] = []
+            item[0]["valid_shape"] = []
+        else:
+            item[0]["shape"] = "NULL"
+        out_list.append(item[0])
+    return out_list
+
+
+def create_pre_build_attr(ori_json):
+    """
+    Create prebuild_outs_attrs
+    :param ori_json: origin json file
+    :return: dict
+    """
+    args = [create_inout_desc(ori_json["outputs"])[0]]
+    if "attrs" in ori_json and ori_json["attrs"] is not None:
+        ori_attrs = ori_json["attrs"]
+        for item in ori_attrs:
+            if "value" in item:
+                args.append(item["value"])
+    pre_build_attr = {"kwds_args": {},
+                      "list_args": args
+                      }
+    return pre_build_attr
+
+
+def create_compute_op(ori_json):
+    """
+    Create compute op's in and out desc
+    :param ori_json: origin json file
+    :return: dict
+    """
+    func_name = ori_json["name"]
+    op_type = ori_json["Type"]
+    full_name = ori_json["full_name"]
+    pattern = ori_json["pattern"] if "pattern" in ori_json else ""
+    op_common_info = {
+        "func_name": func_name,
+        "input_desc": create_inout_desc(ori_json["inputs"]) if "inputs" in ori_json else "null",
+        "module_name": ori_json["module_name"],
+        "name": full_name,
+        "output_desc": create_inout_desc(ori_json["outputs"]) if "outputs" in ori_json else "null",
+        "output_data_desc": create_inout_desc(ori_json["outputs"]) if "outputs" in ori_json else "null",
+        "pattern": pattern,
+        "attr_desc": ori_json["attr_desc"] if "attr_desc" in ori_json else "null",
+        "py_module_path": ori_json["py_module_path"],
+        "type": op_type
+    }
+    return op_common_info
+
+
+def single_to_fusion(json_file, tune_mode):
+    """
+    Change single op json to fusion op json for auto tune
+    :param json_file: origin json file
+    :param tune_mode: tune mode
+    :return: a fusion op json, which contains one op
+    """
+    ori_file = json.loads(json_file)
+    json_info = ori_file["op_info"]
+    soc_info = ori_file["SocInfo"]
+    soc_info["autoTilingMode"] = tune_mode
+    kernel_name = json_info["kernel_name"]
+    ops = create_input(json_info)
+    ops2 = create_compute_op(json_info)
+    ops.append(ops2)
+    end_file = {
+        "SocInfo": soc_info,
+        "fusion_op_name": kernel_name,
+        "l1_size": -1,
+        "op_list": ops
+    }
+    res = json.dumps(end_file, ensure_ascii=False)
+    return res
+
+
+def fusion_to_fusion(json_str, tune_mode):
+    """
+    Add l1_size for fusion json
+    :param json_str: origin json file
+    :param tune_mode: tune mode
+    :return: fusion json info
+    """
+    json_info = json.loads(json_str)
+    json_info["fusion_op"]["l1_size"] = -1
+    json_info["SocInfo"]["autoTilingMode"] = tune_mode
+    end_file = json_info["fusion_op"]
+    end_file["SocInfo"] = json_info["SocInfo"]
+    res = json.dumps(end_file, ensure_ascii=False)
+    return res
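
For orientation, a minimal sketch of how single_to_fusion above would be driven. The input job here is hand-made and only fills the keys those functions actually read; it is not a real compile job emitted by the build server:

    import json
    from mindspore._extends.parallel_compile.tbe_compiler.re_construct_json import single_to_fusion

    # Hypothetical single-op compile job (illustrative values only).
    single_op_job = json.dumps({
        "SocInfo": {"socVersion": "Ascend910"},
        "op_info": {
            "kernel_name": "add_12345_0",
            "name": "add",
            "Type": "Add",
            "full_name": "Default/Add-op1",
            "module_name": "impl.add",
            "py_module_path": "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe",
            "inputs": [[{"name": "x", "dtype": "float16", "shape": [16, 16]}]],
            "outputs": [[{"name": "y", "dtype": "float16", "shape": [16, 16]}]],
        },
    })

    fusion_json = json.loads(single_to_fusion(single_op_job, "GA"))
    print(fusion_json["fusion_op_name"])                   # add_12345_0
    print([op["type"] for op in fusion_json["op_list"]])   # ['Data', 'Add']
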
"name": full_name, + "output_desc": create_inout_desc(ori_json["outputs"]) if "outputs" in ori_json else "null", + "output_data_desc": create_inout_desc(ori_json["outputs"]) if "outputs" in ori_json else "null", + "pattern": pattern, + "attr_desc": ori_json["attr_desc"] if "attr_desc" in ori_json else "null", + "py_module_path": ori_json["py_module_path"], + "type": op_type + } + return op_common_info + + +def single_to_fusion(json_file, tune_mode): + """ + Change single op json to fusion op json for auto tune + :param json_file: origin json file + :param tune_mode: tune mode + :return: a fusion op json, which contain one op + """ + ori_file = json.loads(json_file) + json_info = ori_file["op_info"] + soc_info = ori_file["SocInfo"] + soc_info["autoTilingMode"] = tune_mode + kernel_name = json_info["kernel_name"] + ops = create_input(json_info) + ops2 = create_compute_op(json_info) + ops.append(ops2) + end_file = { + "SocInfo": soc_info, + "fusion_op_name": kernel_name, + "l1_size": -1, + "op_list": ops + } + # op_info = {"fusion_op": end_file} + res = json.dumps(end_file, ensure_ascii=False) + return res + + +def fusion_to_fusion(json_str, tune_mode): + """ + Add l1_size for fusion json + :param json_str: origin json file + :param tune_mode: tune mode + :return: fusion json info + """ + + json_info = json.loads(json_str) + json_info["fusion_op"]["l1_size"] = -1 + json_info["SocInfo"]["autoTilingMode"] = tune_mode + end_file = json_info["fusion_op"] + end_file["SocInfo"] = json_info["SocInfo"] + res = json.dumps(end_file, ensure_ascii=False) + return res diff --git a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py index 36c6c655a9..264e43edb5 100644 --- a/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py +++ b/mindspore/_extends/parallel_compile/tbe_compiler/tbe_process.py @@ -117,6 +117,9 @@ class TbeProcess: self.__pool.join() del self.__pool + def init_auto_tune_env(self, mode): + return "Success" + def init_process_num(self): """ init compile process num diff --git a/mindspore/_extends/remote/kernel_build_server_ascend.py b/mindspore/_extends/remote/kernel_build_server_ascend.py index 1dc513551c..bdd9216851 100644 --- a/mindspore/_extends/remote/kernel_build_server_ascend.py +++ b/mindspore/_extends/remote/kernel_build_server_ascend.py @@ -24,6 +24,9 @@ class TbeBuilder: def __init__(self): self.tbe_builder = create_tbe_parallel_process() + def init_auto_tune_env(self, mode): + return self.tbe_builder.init_auto_tune_env(mode) + def create(self): return self.tbe_builder.init_process_num() @@ -75,6 +78,11 @@ class AscendMessager(Messager): if arg == 'TBE/PRE': ans = self.tbe_builder.create() self.send_res(ans) + elif arg == "TBE/TUNE": + self.send_ack() + tune_mode = self.get_message() + ans = self.tbe_builder.init_auto_tune_env(tune_mode) + self.send_res(ans) elif arg == 'TBE/START': self.send_ack() json = self.get_message() diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc index 8f6b529d7e..084fd8b8d9 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc @@ -55,6 +55,20 @@ std::map KernelFusion(const std::vector std::map kernel_mod_ret; auto build_manger = std::make_shared(); MS_EXCEPTION_IF_NULL(build_manger); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto device_id = 
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
index 8f6b529d7e..084fd8b8d9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.cc
@@ -55,6 +55,20 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo> &fusion_scopes) {
   std::map<int64_t, KernelModPtr> kernel_mod_ret;
   auto build_manger = std::make_shared<ParallelBuildManager>();
   MS_EXCEPTION_IF_NULL(build_manger);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto tune_mode = context_ptr->get_param<std::string>(MS_CTX_TUNE_MODE);
+  std::string offline_tune = common::GetEnv("ENABLE_TUNE_DUMP");
+  if (!offline_tune.empty()) {
+    for (size_t j = 0; j < offline_tune.length(); j++) {
+      offline_tune[j] = tolower(offline_tune[j]);
+    }
+    if (!(offline_tune == "true" || offline_tune == "false")) {
+      MS_LOG(EXCEPTION) << "The value of ENABLE_TUNE_DUMP must be 'true' or 'false'";
+    }
+  }
+
   for (const auto &fusion_scope_iter : fusion_scopes) {
     string fusion_kernel_name;
     nlohmann::json fusion_op;
@@ -64,11 +78,9 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo> &fusion_scopes) {
     }
     // gen kernel_name & check cache
     size_t hash_id = GenFusionJsonHash(fusion_op);
-    auto context_ptr = MsContext::GetInstance();
-    MS_EXCEPTION_IF_NULL(context_ptr);
-    auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
     auto json_name =
       fusion_kernel_name.append("_").append(std::to_string(hash_id)).append("_").append(std::to_string(device_id));
+    fusion_op["graph_id"] = fusion_scope_iter.graph_id;
     fusion_op["fusion_op_name"] = json_name;
     // get io size
     std::vector<size_t> input_size_list;
@@ -79,7 +91,7 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo> &fusion_scopes) {
     }
     // search cache
     auto kernel_pack = TbeUtils::SearchCache(json_name, tbe::kProcessorAiCore);
-    if (kernel_pack != nullptr) {
+    if (kernel_pack != nullptr && ((!offline_tune.empty() && offline_tune != "true") || tune_mode == "NO_TUNE")) {
       auto kernel_mod = build_manger->GenKernelMod(json_name, tbe::kProcessorAiCore, input_size_list, output_size_list,
                                                    kernel_pack);
       if (kernel_mod != nullptr) {
@@ -87,9 +99,16 @@ std::map<int64_t, KernelModPtr> KernelFusion(const std::vector<FusionScopeInfo> &fusion_scopes) {
         continue;
       }
     }
+    // generate soc info json
+    nlohmann::json soc_info_json;
+    TbeUtils::GenSocInfo(&soc_info_json);
+    soc_info_json["autoTilingMode"] = tune_mode;
+    auto soc_version = TbeKernelJsonCreator::GetSocVersion();
+    soc_info_json["socVersion"] = soc_version;
     // fusion build
     nlohmann::json fusion_json;
     fusion_json["fusion_op"] = fusion_op;
+    fusion_json["SocInfo"] = soc_info_json;
     auto task_id = build_manger->StartCompileOp(fusion_json);
     TbeUtils::SaveJsonInfo(json_name, fusion_json.dump());
     if (task_id < 0) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
index 22361124ea..badd8f50b8 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_fusion.h
@@ -26,9 +26,15 @@ namespace kernel {
 * @brief fuse op and return a callable mod
 */
 struct FusionScopeInfo {
-  FusionScopeInfo(int64_t id, std::vector<AnfNodePtr> in, std::vector<AnfNodePtr> comp, std::vector<AnfNodePtr> out)
-      : scope_id(id), input_nodes(std::move(in)), compute_nodes(std::move(comp)), output_nodes(std::move(out)) {}
+  FusionScopeInfo(int64_t id, uint32_t g_id, std::vector<AnfNodePtr> in, std::vector<AnfNodePtr> comp,
+                  std::vector<AnfNodePtr> out)
+      : scope_id(id),
+        graph_id(g_id),
+        input_nodes(std::move(in)),
+        compute_nodes(std::move(comp)),
+        output_nodes(std::move(out)) {}
   int64_t scope_id{};
+  uint32_t graph_id{};
   std::vector<AnfNodePtr> input_nodes;
   std::vector<AnfNodePtr> compute_nodes;
   std::vector<AnfNodePtr> output_nodes;
#include "frontend/parallel/ops_info/ops_utils.h" @@ -93,9 +94,13 @@ constexpr auto kJPattern = "pattern"; constexpr auto kJPyModulePath = "py_module_path"; constexpr auto kJAttrDesc = "attr_desc"; constexpr auto kJSocVersion = "socVersion"; +constexpr auto kAutoTilingMode = "autoTilingMode"; constexpr auto kSOC_VERSION = "SOC_VERSION"; constexpr auto kJIsDynamicShape = "is_dynamic_shape"; constexpr auto kJDynamicIndex = "dynamic_index"; +constexpr auto kJSocInfo = "SocInfo"; + +const auto kPyPath = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe"; bool IsNeedChangeDefaultFormat(const CNodePtr &cnode) { MS_EXCEPTION_IF_NULL(cnode); @@ -114,11 +119,14 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptrimpl_path(); nlohmann::json op_info_json; op_info_json[kJIsDynamicShape] = tbe::TbeDynamicShapeUtil::GetDynamicShapeAttr(anf_node->cast()); - op_info_json[kJName] = op_info_ptr->kernel_name(); + auto func_name = op_info_ptr->kernel_name(); + op_info_json["graph_id"] = AnfAlgo::GetGraphId(anf_node.get()); + op_info_json[kJName] = func_name; + op_info_json[kJModuleName] = std::string("impl.") + func_name; + op_info_json[kJPyModulePath] = kPyPath; // generate inputs json nlohmann::json inputs_json; if (!GenTbeInputsJson(anf_node, op_info_ptr, &inputs_json)) { @@ -148,11 +156,33 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptrget_param(MS_CTX_DEVICE_ID); + auto tune_mode = context_ptr->get_param(MS_CTX_TUNE_MODE); + op_info_json[kJFullName] = anf_node->fullname_with_scope(); json_name_ = op_name + "_" + std::to_string(hash_id) + "_" + std::to_string(device_id); json_info_ = json_str; + op_info_json["Type"] = op_name; op_info_json[kJKernelName] = json_name_; + op_info_json[kGenModel] = kSingle; + op_info_json[kJFullName] = anf_node->fullname_with_scope(); + + // create attr_desc + nlohmann::json attr_desc; + for (const auto &attr : attrs_json) { + if (attr[kJName] != "isRef" && attr[kJValid] == true) { + attr_desc.push_back(attr[kJValue]); + } + } + if (!attr_desc.empty()) { + op_info_json[kJAttrDesc] = attr_desc; + } + + // generate soc info json + nlohmann::json soc_info_json; + TbeUtils::GenSocInfo(&soc_info_json); + soc_info_json[kAutoTilingMode] = tune_mode; + soc_info_json[kJSocVersion] = soc_version; + (*kernel_json)[kJSocInfo] = soc_info_json; (*kernel_json)[kJOpInfo] = op_info_json; - (*kernel_json)[kJFullName] = anf_node->fullname_with_scope(); MS_LOG(DEBUG) << "Operate type:" << creater_type_ << ", full scope name is :" << anf_node->fullname_with_scope() << ", json info name is : " << json_name_ << ", kernel json:" << kernel_json->dump(); @@ -452,14 +482,22 @@ bool TbeKernelJsonCreator::GenTbeAttrJson(const std::shared_ptr &anf_no ParseAttrValue(type, value, &attr_obj); attr_obj[kJValid] = true; } else { - if (op_info->impl_path().empty()) { - attr_obj[kJValid] = false; + auto default_value = attr_ptr->default_value(); + if (!default_value.empty()) { + std::string type = attr_ptr->type(); + ParseAttrDefaultValue(type, default_value, &attr_obj); + attr_obj[kJValid] = true; } else { - if (attr_ptr->param_type() == kParamRequred && creater_type_ == SINGLE_BUILD) { - MS_LOG(EXCEPTION) << "Op name: " << op_info->op_name() << " attr: " << attr_name - << " is required, but not set."; - } else { + MS_LOG(INFO) << "op " << op_name << "'s attr \"" << attr_name << "\" should have a default value."; + if (op_info->impl_path().empty()) { attr_obj[kJValid] = false; + } else { + if (attr_ptr->param_type() == kParamRequred && creater_type_ == 
@@ -567,6 +605,26 @@ void TbeKernelJsonCreator::ParseAttrValue(const std::string &type, const mindspore::ValuePtr &value,
   }
 }
 
+void TbeKernelJsonCreator::ParseAttrDefaultValue(const std::string &type, const std::string &value,
+                                                 nlohmann::json *attr_obj) {
+  MS_EXCEPTION_IF_NULL(attr_obj);
+  if (type == kVTypeInt) {
+    (*attr_obj)[kJValue] = std::stoi(value);
+  } else if (type == kVTypeInt64) {
+    (*attr_obj)[kJValue] = std::stoll(value);
+  } else if (type == kVTypeStr) {
+    (*attr_obj)[kJValue] = value;
+  } else if (type == kVTypeBool) {
+    bool attr_value;
+    std::istringstream(value) >> std::boolalpha >> attr_value;
+    (*attr_obj)[kJValue] = attr_value;
+  } else if (type == kVTypeFloat) {
+    (*attr_obj)[kJValue] = std::stof(value);
+  } else {
+    MS_LOG(EXCEPTION) << "Type: " << type << " is not supported";
+  }
+}
+
 std::vector<size_t> TbeKernelJsonCreator::GetDeviceInputShape(const AnfNodePtr &anf_node, size_t real_index) const {
   MS_EXCEPTION_IF_NULL(anf_node);
   std::vector<size_t> shape;
@@ -792,7 +850,7 @@ void TbeKernelBuild::GenFusionComputeCommonJson(const mindspore::CNodePtr &cnode,
   (*compute_op_str)[kJModuleName] = std::string("impl.") + func_name;
   (*compute_op_str)[kJName] = cnode->fullname_with_scope();
   (*compute_op_str)[kJPattern] = GetNodeFusionType(cnode);
-  (*compute_op_str)[kJPyModulePath] = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe";
+  (*compute_op_str)[kJPyModulePath] = kPyPath;
   (void)(*fusion_kernel_name).append("_");
   (void)(*fusion_kernel_name).append(func_name);
   // attr_desc
@@ -899,12 +957,14 @@ void TbeKernelBuild::GenFusionOutputDescJson(const std::shared_ptr<mindspore::AnfNode> &anf_node,
 }
 
 void TbeKernelBuild::GenReusedOutputDesc(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t index,
-                                         size_t output_index, nlohmann::json *output_desc) {
+                                         size_t output_index, nlohmann::json *output_desc, const size_t out_size) {
   std::string output_desc_name = anf_node->fullname_with_scope() + "_" + std::to_string(index);
   (*output_desc)[kJName] = output_desc_name;
   (*output_desc)[kJOutputIndex] = output_index;
   std::vector<size_t> shape;
   (*output_desc)[kJShape] = shape;
+  auto type_id = AnfAlgo::GetOutputDeviceDataType(anf_node, out_size - 1);
+  (*output_desc)[kJDataType] = tbe::TypeIdToString(type_id);
 }
 
 bool TbeKernelBuild::GetSpecInputLayers(const std::string &op_name,
@@ -1176,7 +1236,7 @@ bool TbeKernelBuild::GenFusionComputeOutputJson(const mindspore::CNodePtr &cnode,
     for (size_t j = output_size; j < desc_output_index.size(); ++j) {
       MS_LOG(INFO) << "Fusion index: " << j << ", desc_output_index: " << desc_output_index[j];
       nlohmann::json output_desc;
-      GenReusedOutputDesc(cnode, j, desc_output_index[j], &output_desc);
+      GenReusedOutputDesc(cnode, j, desc_output_index[j], &output_desc, output_size);
       output_desc_list->emplace_back(output_desc);
     }
   } else {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
index f0bec0b61a..99b7504254 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.h
@@ -73,7 +73,7 @@ class TbeKernelBuild {
                                  nlohmann::json *output_data_desc);
   static void GenSuffixDescJson(nlohmann::json *output_desc);
   static void GenReusedOutputDesc(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t index,
-                                  size_t output_index, nlohmann::json *output_desc);
+                                  size_t output_index, nlohmann::json *output_desc, const size_t out_size);
   static size_t GetIOSizeImpl(const nlohmann::json &desc);
   static bool GetSpecInputLayers(const std::string &op_name, const std::vector<mindspore::CNodePtr> &reorder_layer,
                                  std::map<const AnfNodePtr, FusionDataType> *spec_data_input);
@@ -102,7 +102,9 @@ class TbeKernelJsonCreator {
                          nlohmann::json *inputs_json);
   bool GenTbeOutputsJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
                          nlohmann::json *outputs_json);
+  void GenSocInfo(nlohmann::json *soc_info_json);
   static void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj);
+  static void ParseAttrDefaultValue(const std::string &type, const std::string &value, nlohmann::json *attr_obj);
   bool GenInputDescJson(const std::shared_ptr<AnfNode> &anf_node, size_t real_input_index, bool value,
                         const std::shared_ptr<OpIOInfo> &input_ptr, const string &op_input_name, size_t input_i,
                         std::vector<nlohmann::json> *input_list);
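
A Python rendering of what ParseAttrDefaultValue above does with a registered default string. Sketch only: the type-name strings are assumed values of the kVType* constants, which this diff does not show:

    def parse_attr_default_value(attr_type, value):
        """Convert a registered default string into a typed value,
        keyed by the attr's declared type (mirrors ParseAttrDefaultValue)."""
        parsers = {
            "int": int,                        # kVTypeInt
            "int64": int,                      # kVTypeInt64
            "str": str,                        # kVTypeStr
            "bool": lambda v: v == "true",     # kVTypeBool (boolalpha)
            "float": float,                    # kVTypeFloat
        }
        if attr_type not in parsers:
            raise ValueError(f"Type: {attr_type} is not supported")
        return parsers[attr_type](value)

    assert parse_attr_default_value("int", "0") == 0
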
&desc); static bool GetSpecInputLayers(const std::string &op_name, const std::vector &reorder_layer, std::map *spec_data_input); @@ -102,7 +102,9 @@ class TbeKernelJsonCreator { nlohmann::json *inputs_json); bool GenTbeOutputsJson(const std::shared_ptr &anf_node, const std::shared_ptr &op_info, nlohmann::json *outputs_json); + void GenSocInfo(nlohmann::json *soc_info_json); static void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj); + static void ParseAttrDefaultValue(const std::string &type, const std::string &value, nlohmann::json *attr_obj); bool GenInputDescJson(const std::shared_ptr &anf_node, size_t real_input_index, bool value, const std::shared_ptr &input_ptr, const string &op_input_name, size_t input_i, std::vector *input_list); diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc index 23a4c2019d..71e032118b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_parallel_build.cc @@ -37,6 +37,20 @@ bool TbeOpParallelBuild(const std::vector &anf_nodes) { auto build_manger = std::make_shared(); MS_EXCEPTION_IF_NULL(build_manger); set processed_kernel; + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto tune_mode = context_ptr->get_param(MS_CTX_TUNE_MODE); + std::string offline_tune = common::GetEnv("ENABLE_TUNE_DUMP"); + if (!offline_tune.empty()) { + for (size_t j = 0; j < offline_tune.length(); j++) { + offline_tune[j] = tolower(offline_tune[j]); + } + if (!(offline_tune == "true" || offline_tune == "false")) { + MS_LOG(ERROR) << "The value of ENABLE_TUNE_DUMP must be 'true' or 'false'"; + return false; + } + } + for (const auto &anf_node : anf_nodes) { // gen kernel json if (AnfAlgo::GetKernelMod(anf_node) != nullptr) { @@ -56,7 +70,8 @@ bool TbeOpParallelBuild(const std::vector &anf_nodes) { (void)TbeKernelBuild::GetIOSize(kernel_json, &input_size_list, &output_size_list, anf_node); // search cache const std::string &json_name = creator.json_name(); - if (build_manger->SearchInCache(json_name, processor, input_size_list, output_size_list, anf_node.get())) { + if (build_manger->SearchInCache(json_name, processor, input_size_list, output_size_list, anf_node.get()) && + ((!offline_tune.empty() && offline_tune != "true") || tune_mode == "NO_TUNE")) { continue; } // same op not need build, but need wait build finish to set kernel mode @@ -227,7 +242,8 @@ KernelModPtr ParallelBuildManager::GenKernelMod(const string &json_name, const s } int ParallelBuildManager::StartCompileOp(const nlohmann::json &kernel_json) { - return AscendKernelBuildClient::Instance().TbeStart(kernel_json.dump()); + auto tune_mode = kernel_json["SocInfo"]["autoTilingMode"]; + return AscendKernelBuildClient::Instance().TbeStart(kernel_json.dump(), tune_mode); } bool ParallelBuildManager::WaitOne(int *task_id, std::string *task_result, std::string *pre_build_result) { diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc index 527f92e7e2..7680ce1087 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.cc @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,7 +28,9 @@ #include "runtime/kernel.h" #include "utils/utils.h" #include 
"utils/ms_utils.h" +#include "utils/ms_context.h" #include "ir/dtype/type.h" +#include "backend/session/anf_runtime_algorithm.h" #include "backend/kernel_compiler/tbe/tbe_convert_utils.h" #include "securec/include/securec.h" @@ -40,6 +44,19 @@ constexpr auto kInfoSuffix = ".info"; uintptr_t KernelManager::kernel_stub_gen_ = 0; std::unordered_map KernelManager::info_table_ = {}; +void TbeUtils::GenSocInfo(nlohmann::json *soc_info_json) { + MS_EXCEPTION_IF_NULL(soc_info_json); + std::list list; + (*soc_info_json)["coreNum"] = ""; + (*soc_info_json)["coreType"] = ""; + (*soc_info_json)["l1Fusion"] = "false"; + (*soc_info_json)["l2Fusion"] = "false"; + (*soc_info_json)["l2Mode"] = "2"; + (*soc_info_json)["op_debug_level"] = ""; + (*soc_info_json)["op_impl_mode"] = ""; + (*soc_info_json)["op_impl_mode_list"] = list; +} + void TbeUtils::SaveJsonInfo(const std::string &json_name, const std::string &info) { char real_path[PATH_MAX] = {0}; std::string path = kCceKernelMeta + json_name + kInfoSuffix; diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h index 7d3f639b5e..e367e9d82e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_utils.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "backend/session/kernel_graph.h" #include "ir/anf.h" @@ -43,6 +44,8 @@ class TbeUtils { static void LoadCache(); + static void GenSocInfo(nlohmann::json *soc_info_json); + static KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor); static KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor); diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h index ff86b1c1e7..9013d006db 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h +++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/fusion_base_pass.h @@ -43,6 +43,7 @@ const int8_t MULTI_ELTWISE_SIZE = 4; using FusedNodeRecord = std::vector>; struct BufferFusionInfo_t { + uint32_t graph_id; std::vector anf_nodes; std::vector inputs_list; std::vector outputs_list; diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc index 122d5a3109..1db09fa869 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc @@ -381,6 +381,7 @@ void RemoveCircle(const session::KernelGraph &kernel_graph, void UbPatternFusion::GetBufferFusionInfo(session::KernelGraph *kernel_graph, std::unordered_map *buffer_fusion_infos) const { MS_EXCEPTION_IF_NULL(buffer_fusion_infos); + auto graph_id = kernel_graph->graph_id(); GetFusionScopeComputeNodeList(kernel_graph, buffer_fusion_infos); GetFusionScopeInputNodeList(*kernel_graph, buffer_fusion_infos); GetFusionScopeOutputNodeList(kernel_graph, buffer_fusion_infos); @@ -390,6 +391,7 @@ void UbPatternFusion::GetBufferFusionInfo(session::KernelGraph *kernel_graph, for (auto &buffer_fusion_info : *buffer_fusion_infos) { buffer_fusion_info.second.kernel_build_info = CreateFusionOpKernelInfo(buffer_fusion_info.second.inputs_list, buffer_fusion_info.second.outputs_list); + buffer_fusion_info.second.graph_id = graph_id; } } @@ -403,9 +405,9 @@ bool 
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
index 122d5a3109..1db09fa869 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.cc
@@ -381,6 +381,7 @@ void RemoveCircle(const session::KernelGraph &kernel_graph,
 void UbPatternFusion::GetBufferFusionInfo(session::KernelGraph *kernel_graph,
                                           std::unordered_map<int64_t, BufferFusionInfo_t> *buffer_fusion_infos) const {
   MS_EXCEPTION_IF_NULL(buffer_fusion_infos);
+  auto graph_id = kernel_graph->graph_id();
   GetFusionScopeComputeNodeList(kernel_graph, buffer_fusion_infos);
   GetFusionScopeInputNodeList(*kernel_graph, buffer_fusion_infos);
   GetFusionScopeOutputNodeList(kernel_graph, buffer_fusion_infos);
@@ -390,6 +391,7 @@ void UbPatternFusion::GetBufferFusionInfo(session::KernelGraph *kernel_graph,
   for (auto &buffer_fusion_info : *buffer_fusion_infos) {
     buffer_fusion_info.second.kernel_build_info =
       CreateFusionOpKernelInfo(buffer_fusion_info.second.inputs_list, buffer_fusion_info.second.outputs_list);
+    buffer_fusion_info.second.graph_id = graph_id;
   }
 }
 
@@ -403,9 +405,9 @@ bool UbPatternFusion::FuseBufferFusionPattern(session::KernelGraph *kernel_graph) const {
   std::transform(
     buffer_fusion_infos.begin(), buffer_fusion_infos.end(), std::back_inserter(fusion_scope_infos),
     [](const std::pair<int64_t, BufferFusionInfo_t> &buffer_fusion_info) -> mindspore::kernel::FusionScopeInfo {
-      return mindspore::kernel::FusionScopeInfo(buffer_fusion_info.first, buffer_fusion_info.second.inputs_list,
-                                                buffer_fusion_info.second.anf_nodes,
-                                                buffer_fusion_info.second.outputs_list);
+      return mindspore::kernel::FusionScopeInfo(
+        buffer_fusion_info.first, buffer_fusion_info.second.graph_id, buffer_fusion_info.second.inputs_list,
+        buffer_fusion_info.second.anf_nodes, buffer_fusion_info.second.outputs_list);
     });
   auto kernel_mods = mindspore::kernel::KernelFusion(fusion_scope_infos);
   std::set<int64_t> fusion_ids;
diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.cc b/mindspore/ccsrc/backend/session/kernel_build_client.cc
index 1010c71916..f633447128 100644
--- a/mindspore/ccsrc/backend/session/kernel_build_client.cc
+++ b/mindspore/ccsrc/backend/session/kernel_build_client.cc
@@ -28,18 +28,28 @@ void ReplaceStr(std::string *dest, const std::string &replace, char new_char) {
   }
 }
 
-bool AscendKernelBuildClient::TbePre() {
+bool AscendKernelBuildClient::TbePre(const std::string &mode) {
   auto res = SendRequest(kTbePre);
   if (res.find(kSuccess) == res.npos) {
     MS_LOG(EXCEPTION) << "PRE failed, res: " << res;
   }
   MS_LOG(INFO) << "Pre " << res;
+  // init env for auto tune
+  res = SendRequest(kTbeTune);
+  if (res != kAck) {
+    MS_LOG(EXCEPTION) << "Send tune signal failed, res: " << res;
+  }
+  res = SendRequest(mode);
+  if (res != kSuccess) {
+    MS_LOG(EXCEPTION) << "Tune init failed, res: " << res;
+  }
+
   return true;
 }
 
-int AscendKernelBuildClient::TbeStart(const std::string &json) {
+int AscendKernelBuildClient::TbeStart(const std::string &json, const std::string &mode) {
   if (!init_flag) {
-    if (!TbePre()) {
+    if (!TbePre(mode)) {
       MS_LOG(EXCEPTION) << "START failed";
     }
     init_flag = true;
diff --git a/mindspore/ccsrc/backend/session/kernel_build_client.h b/mindspore/ccsrc/backend/session/kernel_build_client.h
index e10932e6f4..f24c8676ea 100644
--- a/mindspore/ccsrc/backend/session/kernel_build_client.h
+++ b/mindspore/ccsrc/backend/session/kernel_build_client.h
@@ -200,6 +200,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
  constexpr inline static auto kAkgStart = "AKG/START";
  constexpr inline static auto kAkgData = "AKG/DATA";
  constexpr inline static auto kAkgWait = "AKG/WAIT";
+  constexpr inline static auto kTbeTune = "TBE/TUNE";
 
  // Send server info. query to server
  constexpr inline static auto kFormat = "FORMAT";
@@ -222,7 +223,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
  bool CheckSupported(const std::string &json);
 
  // Run TBE building.
-  int TbeStart(const std::string &json);
+  int TbeStart(const std::string &json, const std::string &mode);
  bool TbeWait(int *task_id, std::string *task_result, std::string *pre_build_result);
  void TbeReset();
 
@@ -239,7 +240,7 @@ class AscendKernelBuildClient : public KernelBuildClient {
  AscendKernelBuildClient &operator=(AscendKernelBuildClient &&) = delete;
 
 private:
-  bool TbePre();
+  bool TbePre(const std::string &mode);
  AscendKernelBuildClient() { Open(); }
  ~AscendKernelBuildClient() override { Close(); }
 };
diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
index 35f696e243..efd8f46767 100644
--- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
+++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
@@ -94,6 +94,7 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
                           .value("save_graphs_path", MsCtxParam::MS_CTX_SAVE_GRAPHS_PATH)
                           .value("variable_memory_max_size", MsCtxParam::MS_CTX_VARIABLE_MEMORY_MAX_SIZE)
                           .value("device_id", MsCtxParam::MS_CTX_DEVICE_ID)
+                          .value("tune_mode", MsCtxParam::MS_CTX_TUNE_MODE)
                           .value("max_call_depth", MsCtxParam::MS_CTX_MAX_CALL_DEPTH)
                           .value("env_config_path", MsCtxParam::MS_CTX_ENV_CONFIG_PATH)
                           .value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index e96cadb4a9..7d3bc87372 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -262,6 +262,13 @@ nlohmann::json ConstructTransDataKernelJson(const std::vector<size_t> &host_shape,
   op_info[kernel_name_str] = "";
   op_info[name] = trans_data;
   op_info[outputs_str] = ConstructOutputs(host_shape, type);
+  // construct soc_info
+  nlohmann::json soc_info;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto tune_mode = ms_context->get_param<std::string>(MS_CTX_TUNE_MODE);
+  soc_info["autoTilingMode"] = tune_mode;
+  kernel_json["SocInfo"] = soc_info;
   kernel_json[op_info_str] = op_info;
   kernel_json[platform_str] = platform_tbe;
   std::string json_str = kernel_json[op_info_str].dump();
diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h
index e24655fa07..9eb19b52e6 100644
--- a/mindspore/ccsrc/utils/utils.h
+++ b/mindspore/ccsrc/utils/utils.h
@@ -36,6 +36,9 @@ constexpr auto kComputeAccidentalHitsOpName = "ComputeAccidentalHits";
 constexpr auto kCTCGreedyDecoderOpName = "CTCGreedyDecoder";
 constexpr auto kFour2FiveOpName = "Four2Five";
 constexpr auto kFive2FourOpName = "Five2Four";
+constexpr auto kConv3DOpName = "Conv3D";
+constexpr auto kConv3DBackpropFilterOpName = "Conv3DBackpropFilter";
+constexpr auto kConv3DBackpropInputOpName = "Conv3DBackpropInput";
 constexpr auto kConv2DOpName = "Conv2D";
 constexpr auto kConvBN1OpName = "ConvBN1";
 constexpr auto kBN2AddReluOpName = "BN2AddRelu";
diff --git a/mindspore/context.py b/mindspore/context.py
index e15a8f4cfc..3a4863eb79 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -204,6 +204,13 @@ class _Context:
         if self.enable_debug_runtime and target == "CPU":
             self.set_backend_policy("vm")
 
+    def set_auto_tune_mode(self, tune_mode):
+        candidate = ["NO_TUNE", "RL", "GA", "RL,GA", "GA,RL"]
+        if tune_mode in candidate:
+            self.set_param(ms_ctx_param.tune_mode, tune_mode)
+        else:
+            raise ValueError(f"Tune mode must be in ['NO_TUNE', 'RL', 'GA', 'RL,GA', 'GA,RL'], but got {tune_mode}")
+
     def set_device_id(self, device_id):
         if device_id < 0 or device_id > 4095:
             raise ValueError(f"Device id must be in [0, 4095], but got {device_id}")
@@ -276,6 +283,7 @@ class _Context:
         'save_graphs_path': set_save_graphs_path,
         'device_target': set_device_target,
         'device_id': set_device_id,
+        'auto_tune_mode': set_auto_tune_mode,
         'max_call_depth': set_max_call_depth,
         'profiling_options': set_profiling_options,
         'variable_memory_max_size': set_variable_memory_max_size,
@@ -480,6 +488,7 @@ def _check_target_specific_cfgs(device, arg_key):
        'profiling_options': ['Ascend'],
        'print_file_path': ['Ascend'],
        'variable_memory_max_size': ['Ascend'],
+        'auto_tune_mode': ['Ascend'],
        'max_device_memory': ['GPU']
    }
    # configs not in map device_cfgs are supposed to be suitable for all devices
@@ -494,7 +503,7 @@ def _check_target_specific_cfgs(device, arg_key):
 
 @args_type_check(mode=int, precompile_only=bool, device_target=str, device_id=int, save_graphs=bool,
-                 save_graphs_path=str, enable_dump=bool,
+                 save_graphs_path=str, enable_dump=bool, auto_tune_mode=str,
                  save_dump_path=str, enable_reduce_precision=bool, variable_memory_max_size=str,
                  enable_profiling=bool, profiling_options=str, enable_auto_mixed_precision=bool,
                  enable_graph_kernel=bool, check_bprop=bool, max_device_memory=str, print_file_path=str,
@@ -531,7 +540,7 @@ def set_context(**kwargs):
    mode                        enable_profiling            reserve_class_name_in_scope
                                profiling_options           save_graphs
                                variable_memory_max_size
-                               save_graphs_path
+                               save_graphs_path            auto_tune_mode
                                env_config_path
                                grad_for_scalar
    =========================== =========================== =================
@@ -603,6 +612,13 @@ def set_context(**kwargs):
         enable_sparse (bool): Whether to enable sparsity feature. Default: False.
         max_call_depth (int): Specify the maximum depth of function call. Default: 1000.
         env_config_path (str): Config path for DFX.
+        auto_tune_mode (str): The mode of auto tune when building ops, to get the best tiling performance.
+            Default: NO_TUNE. The value must be in ['NO_TUNE', 'RL', 'GA', 'RL,GA', 'GA,RL'].
+            RL: rl_tune;
+            GA: ga_tune;
+            RL,GA: rl_tune/ga_tune (automatic selection).
+            - rl_tune: Reinforcement Learning tune.
+            - ga_tune: Genetic Algorithm tune.
         grad_for_scalar (bool): Whether to get gradient for scalar. Default: False.
 
     Raises:
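
Usage then mirrors any other Ascend-only context option, e.g.:

    from mindspore import context

    # Enable Genetic Algorithm tuning for TBE kernel builds (Ascend only).
    context.set_context(device_target="Ascend", auto_tune_mode="GA")
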
diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc
index 54c6dba837..b916ea2b05 100644
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@@ -38,8 +38,10 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_ENABLE_DUMP, false);
   set_param<std::string>(MS_CTX_SAVE_DUMP_PATH, ".");
   set_param<std::string>(MS_CTX_ENV_CONFIG_PATH, "");
+  set_param<std::string>(MS_CTX_TUNE_MODE, "NO_TUNE");
   set_param<uint32_t>(MS_CTX_TSD_REF, 0);
   set_param<uint32_t>(MS_CTX_GE_REF, 0);
+
   set_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK, false);
   set_param<bool>(MS_CTX_IS_PYNATIVE_GE_INIT, false);
   set_param<bool>(MS_CTX_ENABLE_REDUCE_PRECISION, true);
diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h
index 11c81992fb..30ac0f88d8 100644
--- a/mindspore/core/utils/ms_context.h
+++ b/mindspore/core/utils/ms_context.h
@@ -108,6 +108,7 @@ enum MsCtxParam : unsigned {
   MS_CTX_VARIABLE_MEMORY_MAX_SIZE,
   MS_CTX_PYTHON_EXE_PATH,
   MS_CTX_ENV_CONFIG_PATH,
+  MS_CTX_TUNE_MODE,
   MS_CTX_TYPE_STRING_END,
 
   // parameter numbers of each type
diff --git a/mindspore/ops/_op_impl/tbe/conv2d.py b/mindspore/ops/_op_impl/tbe/conv2d.py
index f262eb9b9d..99a576ce75 100644
--- a/mindspore/ops/_op_impl/tbe/conv2d.py
+++ b/mindspore/ops/_op_impl/tbe/conv2d.py
@@ -29,6 +29,7 @@ conv2d_op_info = TBERegOp("Conv2D") \
     .attr("dilation", "required", "listInt", "all") \
     .attr("groups", "optional", "int", "all") \
     .attr("format", "optional", "str", "all") \
+    .attr("offset_x", "optional", "int", "all", "0") \
     .input(0, "x", False, "required", "all") \
     .input(1, "filter", False, "required", "all") \
     .input(2, "bias", False, "optional", "all") \
diff --git a/mindspore/ops/_op_impl/tbe/conv3d.py b/mindspore/ops/_op_impl/tbe/conv3d.py
index d3b4ced407..70bac6a1a1 100644
--- a/mindspore/ops/_op_impl/tbe/conv3d.py
+++ b/mindspore/ops/_op_impl/tbe/conv3d.py
@@ -28,7 +28,7 @@ conv3d_op_info = TBERegOp("Conv3D") \
     .attr("dilations", "required", "listInt", "all") \
     .attr("groups", "optional", "int", "all") \
     .attr("format", "optional", "str", "all") \
-    .attr("offset_x", "optional", "int", "all") \
+    .attr("offset_x", "optional", "int", "all", "0") \
     .input(0, "x", False, "required", "all") \
     .input(1, "filter", False, "required", "all") \
     .input(2, "bias", False, "optional", "all") \
diff --git a/mindspore/ops/_op_impl/tbe/depthwise_conv2d.py b/mindspore/ops/_op_impl/tbe/depthwise_conv2d.py
index d7ddce72ec..d7d1249eef 100644
--- a/mindspore/ops/_op_impl/tbe/depthwise_conv2d.py
+++ b/mindspore/ops/_op_impl/tbe/depthwise_conv2d.py
@@ -27,7 +27,7 @@ depthwise_conv2d_op_info = TBERegOp("DepthwiseConv2dNative") \
     .attr("dilation", "required", "listInt", "all") \
     .attr("pad_list", "required", "listInt", "all") \
     .attr("format", "required", "str", "all") \
-    .attr("offset_a", "optional", "int", "all") \
+    .attr("offset_a", "optional", "int", "all", "0") \
     .input(0, "x", False, "required", "all") \
     .input(1, "filter", False, "required", "all") \
     .input(2, "bias", False, "optional", "all") \
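
For completeness, a hypothetical registration showing the new trailing default-value argument to .attr(). Together with ParseAttrDefaultValue above, an optional attr that is missing from the node now falls back to this string instead of being marked invalid or failing the single-op build. The op name here is made up for illustration:

    from mindspore.ops.op_info_register import TBERegOp

    # Hypothetical op; only the default-value argument ("0") is of interest.
    demo_op_info = TBERegOp("DemoOp") \
        .fusion_type("OPAQUE") \
        .attr("offset_x", "optional", "int", "all", "0") \
        .get_op_info()
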