Merge pull request !21205 from yonibaehr/export_yonitags/v1.4.0
@@ -133,6 +133,7 @@ set(TRAIN_SRC
        ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc
        )
if(ENABLE_V0)
@@ -316,7 +316,9 @@ void Tensor::FreeData() {
    this->data_ = nullptr;
  } else {
    allocator_->Free(this->data_);
    this->data_ = nullptr;
    if (!IS_STATIC_ALLOCATOR(allocator_) || (allocator_->RefCount(this->data_) != 0)) {
      this->data_ = nullptr;
    }
  }
}
@@ -34,12 +34,15 @@
namespace mindspore {
namespace lite {
#define STATIC_ALLOCATION -271964
#define IS_STATIC_ALLOCATOR(allocator) ((allocator != nullptr) && (allocator->RefCount(nullptr) == STATIC_ALLOCATION))
struct LiteQuantParam {
  double scale;
  int32_t zeroPoint;
  float var_corr{1};
  float mean_corr{0};
  bool inited;
  bool inited{false};
  std::vector<float> clusters{};
  int bitNum;
  int roundType;
@@ -133,7 +136,6 @@ class Tensor : public mindspore::tensor::MSTensor {
  void set_format(mindspore::Format format) override { this->format_ = format; }
  mindspore::Format format() const override { return this->format_; }
  virtual int ref_count() const { return ref_count_; }
  virtual int init_ref_count() const { return this->init_ref_count_; }
@@ -0,0 +1,90 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/train/opt_allocator.h"
#include <limits>
#include "nnacl/op_base.h"

namespace mindspore {
size_t OptAllocator::FindFree(size_t size) {
  size_t min_size = std::numeric_limits<size_t>::max();
  size_t min_addr = std::numeric_limits<size_t>::max();
  for (auto const &itr : arena_) {
    // best fit
    if (itr.second >= size) {
      if (min_size > itr.second) {
        min_size = itr.second;
        min_addr = itr.first;
      }
    }
  }
  return min_addr;
}

void OptAllocator::Reorder(size_t addr) {
  size_t length = arena_[addr];
  size_t post = addr + length;
  // connect to upper block
  auto it = arena_.find(post);
  if (it != arena_.end()) {
    size_t post_size = it->second;
    arena_[addr] = length + post_size;
    arena_.erase(post);
  }
  // connect to lower block
  auto itr = arena_.lower_bound(addr);
  if (itr != arena_.begin()) {
    itr--;
    size_t last = itr->first;
    if ((last + arena_[last]) == addr) {
      arena_[last] = arena_[last] + arena_[addr];
      arena_.erase(addr);
    }
  }
}

size_t OptAllocator::Malloc(size_t size) {
  size = UP_DIV(size, align_size_) * align_size_;
  size_t addr = FindFree(size);
  // free block not found
  if (addr == std::numeric_limits<size_t>::max()) {
    if (!arena_.empty()) {
      addr = arena_.rbegin()->first;
      if (addr + arena_[addr] < heap_) {
        addr = heap_;
      } else {
        arena_.erase(addr);
      }
    } else {
      addr = heap_;
    }
    heap_ = addr + size;
  } else {
    if (arena_[addr] > size) {
      arena_[addr + size] = arena_[addr] - size;
    }
    arena_.erase(addr);
  }
  alloc_[addr] = size;
  return addr;
}

void OptAllocator::Free(size_t addr) {
  arena_[addr] = alloc_[addr];
  alloc_.erase(addr);
  Reorder(addr);
}
}  // namespace mindspore
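For orientation, a minimal usage sketch of the planner above (the sizes and the main() wrapper are invented for illustration): Malloc returns byte offsets rather than pointers, Free returns a block to the arena where Reorder coalesces it with its neighbours, and total_size() reports the peak extent that one real allocation must later cover.

#include <cassert>
#include "src/train/opt_allocator.h"

int main() {
  mindspore::OptAllocator planner;  // default 32-byte alignment
  size_t a = planner.Malloc(100);   // rounded up to 128, planned at offset 0
  size_t b = planner.Malloc(40);    // rounded up to 64, planned at offset 128
  planner.Free(a);                  // offsets 0..127 go back to the arena
  size_t c = planner.Malloc(60);    // 64 bytes: best fit reuses the freed block at offset 0
  assert(c == a);
  (void)b;
  // A single real buffer of planner.total_size() bytes (192 here) backs all planned offsets.
  assert(planner.total_size() == 192);
  return 0;
}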
@@ -0,0 +1,41 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
#define MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
#include <map>
#include "include/api/allocator.h"

namespace mindspore {
class OptAllocator {
 public:
  explicit OptAllocator(size_t aligned_size = 32) : align_size_(aligned_size) {}
  ~OptAllocator() {}
  size_t Malloc(size_t size);
  void Free(size_t offset);
  size_t total_size() { return heap_; }

 private:
  size_t FindFree(size_t size);
  void Reorder(size_t addr);
  std::map<size_t, size_t> arena_;
  std::map<size_t, size_t> alloc_;
  size_t heap_ = 0;
  size_t align_size_;
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
@@ -0,0 +1,52 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
#define MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
#include <cstddef>
#include <cstdlib>
#include "include/api/allocator.h"  // base Allocator interface (same header used by opt_allocator.h)
#include "src/tensor.h"             // assumed location of the STATIC_ALLOCATION sentinel added in this PR

namespace mindspore {
class StaticAllocator : public Allocator {
 public:
  void SetContex(void *buf, size_t size) {
    start_buf_ = buf;
    size_ = size;
  }
  int SetRefCount(void *ptr, int ref_count) override { return 0; }
  int DecRefCount(void *ptr, int ref_count) override { return 0; }
  int IncRefCount(void *ptr, int ref_count) override { return 0; }
  size_t total_size() { return total_size_; }
  void Clear() {}
  void *Malloc(size_t size) override {
    total_size_ += size;
    return malloc(size);
  }
  void Free(void *ptr) override {
    if (RefCount(ptr) != 0) free(ptr);
  }
  int RefCount(void *ptr) override {
    if (ptr == nullptr) return STATIC_ALLOCATION;
    char *ptrc = reinterpret_cast<char *>(ptr);
    char *bufc = reinterpret_cast<char *>(start_buf_);
    return ((ptrc < bufc) || (ptrc - bufc >= static_cast<ptrdiff_t>(size_)) ? 1 : 0);
  }

 private:
  void *start_buf_ = nullptr;
  size_t size_ = 0;
  size_t total_size_ = 0;
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
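A note on the RefCount contract the rest of this change relies on: RefCount(nullptr) doubles as a type probe, answering the STATIC_ALLOCATION sentinel so that IS_STATIC_ALLOCATOR (added in the tensor header hunk above) can detect a StaticAllocator behind the generic Allocator interface, while for a real pointer it answers 0 inside the planned arena and 1 outside, which is what Free() and the new Tensor::FreeData branch key off. A minimal sketch of that contract, assuming the sentinel macros are visible via the tensor header; the helper name is invented:

#include <memory>
#include "src/tensor.h"                  // assumed location of STATIC_ALLOCATION / IS_STATIC_ALLOCATOR
#include "src/train/static_allocator.h"

// Hypothetical helper mirroring the FreeData branch added in this PR.
void FreeLikeTensor(const std::shared_ptr<mindspore::Allocator> &alloc, void **data) {
  alloc->Free(*data);  // a StaticAllocator only calls free() on pointers outside its arena
  if (!IS_STATIC_ALLOCATOR(alloc) || alloc->RefCount(*data) != 0) {
    *data = nullptr;   // pointers inside the planned arena keep their address for reuse
  }
}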
@@ -39,6 +39,8 @@
#include "src/train/optimizer_kernel.h"
#include "src/train/train_utils.h"
#include "src/train/train_export.h"
#include "src/train/opt_allocator.h"
#include "src/train/static_allocator.h"
#include "src/train/train_populate_parameter.h"
#include "src/train/train_populate_parameter_v0.h"
@@ -68,6 +70,7 @@ int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) {
    }
    cfg_ = *train_cfg;
  }
  allocator_ = context->allocator;
  return lite::LiteSession::Init(context);
}
@@ -159,6 +162,51 @@ int TrainSession::InitCallBack() {
  return RET_OK;
}

int TrainSession::AllocTensors(const std::vector<kernel::LiteKernel *> &kernels) {
  if (!IS_STATIC_ALLOCATOR(allocator_)) return RET_OK;
  OptAllocator allocator;
  std::unordered_map<lite::Tensor *, int> ref_count;
  std::unordered_map<lite::Tensor *, size_t> offset_map;
  for (auto kernel : kernels) {
    for (auto tensor : kernel->out_tensors()) {
      size_t size = tensor->Size();
      size_t offset = allocator.Malloc(size);
      offset_map[tensor] = offset;
      ref_count[tensor] = tensor->init_ref_count();
    }
    for (auto tensor : kernel->in_tensors()) {
      if (tensor->category() == lite::Tensor::VAR) {
        int count = ref_count[tensor] - 1;
        ref_count[tensor] = count;
        if (count == 0) {
          allocator.Free(offset_map[tensor]);
        }
      }
    }
  }
  // Set Tensor data
  if (tensors_data_ == nullptr) {
    auto size = allocator.total_size();
    auto buf = malloc(size);
    if (buf == nullptr) {
      MS_LOG(ERROR) << "cannot allocate buffer of size " << size;
      return RET_ERROR;
    }
    StaticAllocator *alloc = reinterpret_cast<StaticAllocator *>(allocator_.get());
    alloc->SetContex(buf, size);
    tensors_data_ = buf;
  }
  for (auto kernel : train_kernels_) {
    for (auto tensor : kernel->out_tensors()) {
      auto it = offset_map.find(tensor);
      if (it != offset_map.end()) {
        tensor->set_data(reinterpret_cast<void *>(reinterpret_cast<char *>(tensors_data_) + it->second));
      }
    }
  }
  return RET_OK;
}

int TrainSession::CompileGraph(lite::Model *model) { return lite::RET_ERROR; }

int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
@@ -194,10 +242,21 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
    MS_LOG(ERROR) << "failed to allocate space";
    return RET_ERROR;
  }
  ret = AllocTensors(train_kernels_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "failed to allocate space";
    return RET_ERROR;
  }
  return RET_OK;
}

TrainSession::~TrainSession() { FreeWorkSpace(); }
TrainSession::~TrainSession() {
  FreeWorkSpace();
  if (tensors_data_ != nullptr) {
    free(tensors_data_);
    tensors_data_ = nullptr;
  }
}

int TrainSession::ExecKernels(const KernelCallBack &before, const KernelCallBack &after,
                              const std::vector<kernel::LiteKernel *> &run_kernels) {
@@ -420,6 +479,12 @@ int TrainSession::Train() {
      lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
    }
  }
  // allocate tensors
  auto ret = AllocTensors(train_kernels_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "failed to allocate tensor space";
    return RET_ERROR;
  }
  return RET_OK;
}
@@ -446,6 +511,11 @@ int TrainSession::Eval() {
      lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
    }
  }
  auto ret = AllocTensors(inference_kernels_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "failed to allocate space";
    return RET_ERROR;
  }
  return RET_OK;
}
@@ -781,7 +851,12 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin
    MS_LOG(ERROR) << "create session failed";
    return nullptr;
  }
  if (context->allocator == nullptr) {
    const_cast<lite::Context *>(context)->allocator = std::shared_ptr<Allocator>(new (std::nothrow) StaticAllocator());
    if (context->allocator == nullptr) {
      MS_LOG(ERROR) << "cannot convert to static allocation";
    }
  }
  auto ret = session->Init(context, cfg);
  if (ret != mindspore::lite::RET_OK) {
    MS_LOG(ERROR) << "init session failed";
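End to end, the pieces compose as follows: CreateTrainSession installs a StaticAllocator when the caller's context carries no allocator, CompileTrainGraph / Train() / Eval() call AllocTensors to plan per-tensor offsets with OptAllocator, one malloc of the planned total is bound to the allocator via SetContex, and the session destructor releases that arena. A hedged usage sketch from the application side; the model path is a placeholder and the exact public header paths are assumptions:

#include "include/context.h"              // assumed header for lite::Context
#include "include/train/train_session.h"  // assumed header for session::TrainSession

int main() {
  mindspore::lite::Context context;  // no allocator supplied: the session installs a StaticAllocator
  auto *session =
      mindspore::session::TrainSession::CreateTrainSession("model.ms", &context, true, nullptr);
  if (session == nullptr) {
    return -1;
  }
  session->Train();  // switching modes re-plans the static arena via AllocTensors
  // ... set inputs, RunGraph(), Eval(), and so on.
  delete session;    // ~TrainSession frees the shared tensors_data_ buffer
  return 0;
}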
@@ -147,6 +147,7 @@ class TrainSession : virtual public lite::LiteSession {
  void FreeRestoreTensors();
  bool AllInputsNeedScale(kernel::LiteKernel *kernel);
  void FreeWorkSpace();
  int AllocTensors(const std::vector<kernel::LiteKernel *> &kernels);
  std::map<Tensor *, Tensor *> restored_origin_tensors_;
  int virtual_batch_idx_ = 0;
@@ -155,6 +156,8 @@ class TrainSession : virtual public lite::LiteSession {
  void *workspace_ = nullptr;
  SchedCallBack sched_mix_precision_callback_;
  bool train_mode_ = false;
  void *tensors_data_ = nullptr;
  std::shared_ptr<Allocator> allocator_;
};
}  // namespace lite
@@ -603,7 +603,7 @@ int NetTrain::InitCallbackParameter() {
    }
    op_call_times_total_++;
    op_begin_ = GetTimeUs();
    if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign")) {
    if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign") || (callParam.node_type == "SGD")) {
      for (auto tensor : before_outputs) {
        std::fill(reinterpret_cast<int8_t *>(tensor->MutableData()),
                  reinterpret_cast<int8_t *>(tensor->MutableData()) + tensor->Size(), 0);