From baf620a3e875e7cf6cfa82eb3c56392e2b7fab9a Mon Sep 17 00:00:00 2001
From: dogvane <dogvane@gmail.com>
Date: Sun, 8 Oct 2023 22:06:15 +0800
Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3keras=E6=A8=A1=E5=BC=8F?=
 =?UTF-8?q?=E4=B8=8B=EF=BC=8C=E4=BD=BF=E7=94=A8GPU=E8=AE=AD=E7=BB=83?=
 =?UTF-8?q?=E6=97=B6=E4=BC=9A=E7=88=86=E6=98=BE=E5=AD=98=E7=9A=84bug?=
 =?UTF-8?q?=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

观察到的现象是：一些模型增大 batch size 后，会在首个 epoch 中途因显存不足而崩溃；只要撑过第一个 epoch，就能完整训练。而同样的模型在 Python 下可以把 batch size 设置得大得多。
最后通过最小化的训练代码分析得出，原因是每个 step 结束后，加载到显存中的图片数据没有被释放。
在没有找到显式释放显存的接口的情况下，尝试直接调用 GC.Collect()，发现可以促使显存被回收。
因此当前的修复方案是：在每个 step 结束时都执行一次 GC.Collect()，用来释放显存资源。
---
 src/TensorFlowNET.Core/Keras/Engine/IModel.cs | 23 +++++++++++++++++++
 .../Engine/Model.Evaluate.cs                  |  3 +++
 src/TensorFlowNET.Keras/Engine/Model.Fit.cs   | 12 +++++-----
 .../Engine/Model.Predict.cs                   |  2 +-
 4 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/src/TensorFlowNET.Core/Keras/Engine/IModel.cs b/src/TensorFlowNET.Core/Keras/Engine/IModel.cs
index 1840f88b..889c76d9 100644
--- a/src/TensorFlowNET.Core/Keras/Engine/IModel.cs
+++ b/src/TensorFlowNET.Core/Keras/Engine/IModel.cs
@@ -24,6 +24,7 @@ public interface IModel : ILayer
             List<ICallback> callbacks = null,
             float validation_split = 0f,
             ValidationDataPack validation_data = null,
+            int validation_step = 10,
             bool shuffle = true,
             Dictionary<int, float> class_weight = null,
             NDArray sample_weight = null,
@@ -47,6 +48,20 @@ public interface IModel : ILayer
             int workers = 1,
             bool use_multiprocessing = false);
 
+    public ICallback fit(IDatasetV2 dataset,
+            int batch_size = -1,
+            int epochs = 1,
+            int verbose = 1,
+            List<ICallback> callbacks = null,
+            IDatasetV2 validation_data = null,
+            int validation_step = 10,   // interval between validation runs (how many iterations pass before validating once)
+            bool shuffle = true,
+            Dictionary<int, float> class_weight = null,
+            int initial_epoch = 0,
+            int max_queue_size = 10,
+            int workers = 1,
+            bool use_multiprocessing = false);
+
     void save(string filepath,
             bool overwrite = true,
             bool include_optimizer = true,
@@ -85,6 +100,14 @@ public interface IModel : ILayer
             int workers = 1,
             bool use_multiprocessing = false);
 
+    public Tensors predict(IDatasetV2 dataset,
+            int batch_size = -1,
+            int verbose = 0,
+            int steps = -1,
+            int max_queue_size = 10,
+            int workers = 1,
+            bool use_multiprocessing = false);
+
     void summary(int line_length = -1, float[] positions = null);
 
     IKerasConfig get_config();
diff --git a/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs b/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs
index 94a2e664..474d5e5a 100644
--- a/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs
+++ b/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs
@@ -132,6 +132,7 @@ namespace Tensorflow.Keras.Engine
                     var end_step = step + data_handler.StepIncrement;
                     if (!is_val)
                         callbacks.on_test_batch_end(end_step, logs);
+                    GC.Collect();
                 }
             }
             callbacks.on_test_end(logs);
@@ -167,7 +168,9 @@ namespace Tensorflow.Keras.Engine
         Dictionary<string, float> test_step(DataHandler data_handler, Tensors x, Tensors y)
         {
             (x,y) = data_handler.DataAdapter.Expand1d(x, y);
+
             var y_pred = Apply(x, training: false);
+
             var loss = compiled_loss.Call(y, y_pred);
             compiled_metrics.update_state(y, y_pred);
             return metrics.Select(x => (x.Name, x.result())).ToDictionary(x => x.Item1, x => (float)x.Item2);
diff --git a/src/TensorFlowNET.Keras/Engine/Model.Fit.cs b/src/TensorFlowNET.Keras/Engine/Model.Fit.cs
index 689fc9fb..d61211c7 100644
--- a/src/TensorFlowNET.Keras/Engine/Model.Fit.cs
+++ b/src/TensorFlowNET.Keras/Engine/Model.Fit.cs
@@ -41,6 +41,7 @@ namespace Tensorflow.Keras.Engine
             List<ICallback> callbacks = null,
             float validation_split = 0f,
             ValidationDataPack validation_data = null,
+            int validation_step = 10,
             bool shuffle = true,
             Dictionary<int, float> class_weight = null,
             NDArray sample_weight = null,
@@ -147,7 +148,7 @@ namespace Tensorflow.Keras.Engine
             }
         }
 
-        public History fit(IDatasetV2 dataset, 
+        public ICallback fit(IDatasetV2 dataset, 
             int batch_size = -1,
             int epochs = 1,
             int verbose = 1,
@@ -156,7 +157,6 @@ namespace Tensorflow.Keras.Engine
             int validation_step = 10,
             bool shuffle = true,
             Dictionary<int, float> class_weight = null,
-            NDArray sample_weight = null,
             int initial_epoch = 0,
             int max_queue_size = 10,
             int workers = 1,
@@ -170,7 +170,7 @@ namespace Tensorflow.Keras.Engine
                 InitialEpoch = initial_epoch,
                 Epochs = epochs,
                 Shuffle = shuffle,
-                SampleWeight = sample_weight,
+                ClassWeight = class_weight,
                 MaxQueueSize = max_queue_size,
                 Workers = workers,
                 UseMultiprocessing = use_multiprocessing,
@@ -218,6 +218,7 @@ namespace Tensorflow.Keras.Engine
                     var end_step = step + data_handler.StepIncrement;
                     End_step = end_step;
                     callbacks.on_train_batch_end(end_step, logs);
+                    GC.Collect();
                 }
 
                 if (validation_data != null)
@@ -233,11 +234,10 @@ namespace Tensorflow.Keras.Engine
                     callbacks.on_train_batch_end(End_step, logs);
                 }
 
+                GC.Collect();
 
                 callbacks.on_epoch_end(epoch, logs);
 
-                GC.Collect();
-                GC.WaitForPendingFinalizers();
                 if (stop_training)
                 {
                     break;
@@ -282,6 +282,7 @@ namespace Tensorflow.Keras.Engine
                     var end_step = step + data_handler.StepIncrement;
                     End_step = end_step;
                     callbacks.on_train_batch_end(end_step, logs);
+                    GC.Collect();
                 }
 
                 if (validation_data != null)
@@ -301,7 +302,6 @@ namespace Tensorflow.Keras.Engine
                 callbacks.on_epoch_end(epoch, logs);
 
                 GC.Collect();
-                GC.WaitForPendingFinalizers();
                 if (stop_training)
                 {
                     break;
diff --git a/src/TensorFlowNET.Keras/Engine/Model.Predict.cs b/src/TensorFlowNET.Keras/Engine/Model.Predict.cs
index cbe4a729..e3a5aba6 100644
--- a/src/TensorFlowNET.Keras/Engine/Model.Predict.cs
+++ b/src/TensorFlowNET.Keras/Engine/Model.Predict.cs
@@ -102,9 +102,9 @@ namespace Tensorflow.Keras.Engine
                         for (int i = 0; i < batch_outputs.Length; i++)
                             batch_outputs[i] = tf.concat(new Tensor[] { batch_outputs[i], tmp_batch_outputs[i] }, axis: 0);
                     }
-
                     var end_step = step + data_handler.StepIncrement;
                     callbacks.on_predict_batch_end(end_step, new Dictionary<string, Tensors> { { "outputs", batch_outputs } });
+                    GC.Collect();
                 }
             }