From d9c4ef59febc7bbe139361fe7961ce78dec8a80e Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Thu, 24 Mar 2022 19:05:33 +0800
Subject: [PATCH] perf(imperative): using simple hash key in heuristic cache

GitOrigin-RevId: 6fddd612e7cc193a140a401fd2a62a98a5056b1d
---
 dnn/include/megdnn/heuristic_cache.h        |  20 ++--
 dnn/src/common/heuristic_cache.cpp          | 107 ++++++++++++--------
 imperative/src/impl/algo_chooser.h          |   5 +-
 imperative/src/impl/ops/convolution.cpp     |  23 ++---
 src/opr/impl/search_policy/algo_chooser.cpp |   4 +-
 5 files changed, 91 insertions(+), 68 deletions(-)
diff --git a/dnn/include/megdnn/heuristic_cache.h b/dnn/include/megdnn/heuristic_cache.h
index f8daf65a..fbf24f86 100644
--- a/dnn/include/megdnn/heuristic_cache.h
+++ b/dnn/include/megdnn/heuristic_cache.h
@@ -29,15 +29,12 @@ public:
     MGE_WIN_DECLSPEC_FUC static HeuristicCache& instance();
 
     struct KeyStorage {
-        std::string category;
-        std::string input;
+        size_t k1, k2;  // hashes of the layout/platform words and of the param bytes
 
-        bool operator==(const KeyStorage& k) const {
-            return category == k.category && input == k.input;
-        }
+        bool operator==(const KeyStorage& k) const { return k1 == k.k1 && k2 == k.k2; }
     };
 
-    class Key {
+    struct Key {
         Handle* m_handle;
         uint32_t m_opr_type;
         const TensorLayout* m_inp_layouts_ptr;
@@ -45,8 +42,7 @@ public:
         const void* m_param_ptr;
         size_t m_param_size;
 
-        mutable std::string m_category;
-        mutable std::string m_input;
+        mutable SmallVector<size_t> m_buf;
 
     public:
         Key(Handle* opr_handle, Algorithm::OprType opr_type,
@@ -65,6 +61,10 @@ public:
     struct Result {
         ExecutionPolicy policy;
         size_t workspace;
+
+        // key material kept so get() can tell a real hit from a hash collision
+        SmallVector<size_t> m_buf;
+        SmallVector<char> m_param_buf;
     };
 
     MGE_WIN_DECLSPEC_FUC void put(const Key& key, Result& result);
@@ -76,8 +76,8 @@ public:
 private:
     struct Hash {
         size_t operator()(const KeyStorage& k) const {
-            size_t h1 = std::hash<std::string>{}(k.category);
-            size_t h2 = std::hash<std::string>{}(k.input);
+            size_t h1 = k.k1;
+            size_t h2 = k.k2;
             h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2);
             return h1;
         }
diff --git a/dnn/src/common/heuristic_cache.cpp b/dnn/src/common/heuristic_cache.cpp
index 0d6296bf..189b289f 100644
--- a/dnn/src/common/heuristic_cache.cpp
+++ b/dnn/src/common/heuristic_cache.cpp
@@ -11,6 +11,8 @@
  */
 
 #include "megdnn/heuristic_cache.h"
+#include "megdnn/tensor_format.h"
+#include "src/common/hash_ct.h"
 #include "src/common/utils.h"
 #include "src/naive/handle.h"
 
@@ -32,38 +34,27 @@ HeuristicCache& HeuristicCache::instance() {
 }
 
 HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const {
-    auto&& ctg = m_category;
-    auto&& inp = m_input;
+    // Packs layouts, opr type and platform info into m_buf, then hashes it.
+    m_buf.resize(16 * m_inp_layouts_size + 6);  // worst-case word count
 
-    if (!m_category.empty() && !m_input.empty())
-        return {ctg, inp};
-
-    inp.reserve(sizeof(TensorLayout) * 3 * m_inp_layouts_size + m_param_size);
+    size_t pos = 0;  // number of key words written so far
     for (size_t i = 0; i < m_inp_layouts_size; i++) {
-        auto&& ly = m_inp_layouts_ptr[i];
-        for (size_t j = 0; j < ly.ndim; j++) {
-            if (j)
-                inp.push_back(',');
-            inp.append(std::to_string(ly.shape[j]));
+        auto&& layout = m_inp_layouts_ptr[i];
+        if (layout.dtype.valid()) {
+            m_buf[pos++] = static_cast<size_t>(layout.dtype.enumv());
+        } else {
+            m_buf[pos++] = static_cast<size_t>(SIZE_MAX);  // invalid-dtype marker
         }
-        inp.push_back(';');
-        for (size_t j = 0; j < ly.ndim; j++) {
-            if (j)
-                inp.push_back(',');
-            inp.append(std::to_string(ly.stride[j]));
+        m_buf[pos++] = static_cast<size_t>(layout.format.type());
+        for (size_t j = 0; j < layout.ndim; j++) {
+            m_buf[pos++] = layout.shape[j];
+            m_buf[pos++] = layout.stride[j];
         }
-        inp.push_back(';');
-        inp.append(ly.dtype.name());
-        inp.push_back(';');
-        inp.append(ly.format.to_string().c_str());
-        inp.push_back('|');
-    }
-    if (m_param_size) {
-        inp.append(reinterpret_cast<const char*>(m_param_ptr), m_param_size);
     }
 
-    ctg = "plat:";
-    ctg.append(std::to_string(static_cast<uint32_t>(m_handle->type())));
+    m_buf[pos++] = m_opr_type;
+    m_buf[pos++] = static_cast<size_t>(m_handle->type());
+    // Platform-specific words follow, chosen by the handle type.
     switch (m_handle->type()) {
 #if MEGDNN_WITH_CUDA
         case Handle::HandleType::CUDA: {
@@ -72,9 +63,9 @@ HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const {
             cuda_rt /= 1000;
             auto&& handle = static_cast<megdnn::cuda::HandleImpl*>(m_handle);
             auto&& prop = handle->device_prop();
-            ctg.append(ssprintf(
-                    ";dev=%s;cap=%d.%d;runtime=%d;", prop.name, prop.major, prop.minor,
-                    cuda_rt));
+            m_buf[pos++] = prop.major;
+            m_buf[pos++] = prop.minor;
+            m_buf[pos++] = cuda_rt;
             break;
         }
 #endif
@@ -85,9 +76,10 @@ HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const {
             int drv = -1, hip_rt = -1;
             hip_check(hipDriverGetVersion(&drv));
             hip_check(hipRuntimeGetVersion(&hip_rt));
-            ctg.append(ssprintf(
-                    ";dev=%s;cap=%d.%d,drv=%d;runtime=%d;", prop.name, prop.major,
-                    prop.minor, drv, hip_rt));
+            m_buf[pos++] = prop.major;
+            m_buf[pos++] = prop.minor;
+            m_buf[pos++] = drv;
+            m_buf[pos++] = hip_rt;
             break;
         }
 #endif
@@ -108,16 +100,21 @@ HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const {
             size_t nr_threads = static_cast<megdnn::naive::HandleImpl*>(m_handle)
                                         ->megcore_dispatcher()
                                         ->nr_threads();
-            ctg.append(";");
-            ctg.append(std::to_string(nr_threads));
-            ctg.append(";");
+            m_buf[pos++] = nr_threads;
             break;
         }
         default:
-            ctg.append(";");
+            break;
     }
-    ctg.append(std::to_string(m_opr_type));
-    return {ctg, inp};
+
+    // Trim the worst-case-sized buffer down to the words actually written.
+    m_buf.resize(pos);
+
+    auto key_bytes = reinterpret_cast<const char*>(m_buf.data());
+    size_t k1 = XXHash64CT::hash(key_bytes, pos * sizeof(size_t), 20220328);
+    size_t k2 = XXHash64CT::hash((const char*)m_param_ptr, m_param_size, 20220328);
+
+    return {k1, k2};
 }
 
 void HeuristicCache::put(const Key& key, Result& result) {
@@ -126,15 +123,41 @@ void HeuristicCache::put(const Key& key, Result& result) {
         m_heuristic_cache[key.build_key_storage()] = result;
 }
 
+// Element-wise equality of two raw buffers. Returns true only when both
+// hold the same number of elements and every pair of elements compares
+// equal; a length mismatch yields false without reading either buffer.
+template <typename T>
+bool is_same_buf(
+        const T hash_buf[], const size_t buf_size, const T hash_buf_[],
+        const size_t buf_size_) {
+    bool same = buf_size == buf_size_;
+    // `same` doubles as the loop guard, so scanning stops at the first mismatch.
+    for (size_t idx = 0; same && idx < buf_size; ++idx) {
+        same = hash_buf[idx] == hash_buf_[idx];
+    }
+    return same;
+}
+
 HeuristicCache::Result HeuristicCache::get(const Key& key) {
     MEGDNN_LOCK_GUARD(m_mtx);
     KeyStorage ks = key.build_key_storage();
     auto iter = m_heuristic_cache.find(ks);
-    if (iter == m_heuristic_cache.end()) {
-        return {};
-    } else {
-        return iter->second;
+    auto param = static_cast<const char*>(key.m_param_ptr);
+    if (iter != m_heuristic_cache.end()) {
+        auto&& hit = iter->second;  // equal hashes do not prove equal keys
+        if (is_same_buf(key.m_buf.data(), key.m_buf.size(), hit.m_buf.data(),
+                        hit.m_buf.size()) &&
+            is_same_buf(param, key.m_param_size, hit.m_param_buf.data(),
+                        hit.m_param_buf.size())) {
+            return hit;
+        }
+        megdnn_log_warn(
+                "hash collision occurs in heuristic cache with key: (%zu, %zu)", ks.k1,
+                ks.k2);
     }
+    // Miss (or collision): return an empty policy plus this key's material, which
+    // callers pass back through put() alongside the policy they choose.
+    return Result{{}, 0, key.m_buf, SmallVector<char>(param, param + key.m_param_size)};
 }
 
 void HeuristicCache::clear() {
diff --git a/imperative/src/impl/algo_chooser.h b/imperative/src/impl/algo_chooser.h
index 454c8723..d8e481f3 100644
--- a/imperative/src/impl/algo_chooser.h
+++ b/imperative/src/impl/algo_chooser.h
@@ -18,6 +18,8 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
         megdnn_opr->execution_policy() = rst.policy;
         return rst.workspace;
     }
+    SmallVector<size_t> buf = rst.m_buf;
+    SmallVector<char> param_buf = rst.m_param_buf;
 
     std::string param_str;
     megdnn::Algorithm::serialize_write_pod(megdnn_opr->param(), param_str);
@@ -40,11 +42,10 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
     megdnn::ExecutionPolicy policy;
     policy = mgb::rdnn::AlgoChooser<Opr>::get_policy(helper);
     size_t workspace = helper.get_workspace_size_bytes(policy, layouts);
-
     megdnn_opr->execution_policy() = policy;
 
     if (execution_policy.strategy & rdnn::ExecutionStrategy::HEURISTIC) {
-        megdnn::HeuristicCache::Result cache_result{policy, workspace};
+        megdnn::HeuristicCache::Result cache_result{policy, workspace, buf, param_buf};
         megdnn::HeuristicCache::instance().put(cache_key, cache_result);
     }
     return workspace;
diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp
index 0a576123..3c279104 100644
--- a/imperative/src/impl/ops/convolution.cpp
+++ b/imperative/src/impl/ops/convolution.cpp
@@ -123,8 +123,6 @@ TensorLayout do_shape_infer(
 
 std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
         const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
-    using Param = ::megdnn::param::Convolution;
-
     SmallVector<LogicalTensorDesc> dests(1);
     auto&& desc = dests[0];
     desc.comp_node = inputs[0].comp_node;
@@ -166,15 +164,16 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     }
     oup_shapes[0] = out_layout;
     DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn);
-    dnn_opr.op->param().pad_h = conv.pad_h;
-    dnn_opr.op->param().pad_w = conv.pad_w;
-    dnn_opr.op->param().stride_h = conv.stride_h;
-    dnn_opr.op->param().stride_w = conv.stride_w;
-    dnn_opr.op->param().dilate_h = conv.dilate_h;
-    dnn_opr.op->param().dilate_w = conv.dilate_w;
-    dnn_opr.op->param().sparse = conv.sparse;
-    dnn_opr.op->param().compute_mode = conv.compute_mode;
-    dnn_opr.op->param().format = conv.format;
+    auto&& param = dnn_opr.op->param();
+    param.pad_h = conv.pad_h;
+    param.pad_w = conv.pad_w;
+    param.stride_h = conv.stride_h;
+    param.stride_w = conv.stride_w;
+    param.dilate_h = conv.dilate_h;
+    param.dilate_w = conv.dilate_w;
+    param.sparse = conv.sparse;
+    param.compute_mode = conv.compute_mode;
+    param.format = conv.format;
 
     // shape infer
     TensorLayout shp({0}, inputs[0]->dtype());
@@ -513,8 +512,6 @@ TensorLayout do_shape_infer(
 
 std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
         const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
-    using Param = ::megdnn::param::Convolution3D;
-
     SmallVector<LogicalTensorDesc> dests(1);
     auto&& desc = dests[0];
     desc.comp_node = inputs[0].comp_node;
diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp
index 5ee30dd6..25eeeeb3 100644
--- a/src/opr/impl/search_policy/algo_chooser.cpp
+++ b/src/opr/impl/search_policy/algo_chooser.cpp
@@ -42,6 +42,8 @@ size_t AlgoChooser<Opr>::setup_algo(
         megdnn_opr->execution_policy() = rst.policy;
         return rst.workspace;
     }
+    SmallVector<size_t> buf = rst.m_buf;
+    SmallVector<char> param_buf = rst.m_param_buf;
 
     if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
         return 0;
@@ -92,7 +94,7 @@ size_t AlgoChooser<Opr>::setup_algo(
     megdnn_opr->execution_policy() = policy;
 
     if (mgb_opr->execution_policy().strategy & rdnn::ExecutionStrategy::HEURISTIC) {
-        HeuristicCache::Result cache_result{policy, workspace};
+        HeuristicCache::Result cache_result{policy, workspace, buf, param_buf};
         HeuristicCache::instance().put(cache_key, cache_result);
     }
     return workspace;