From baf620a3e875e7cf6cfa82eb3c56392e2b7fab9a Mon Sep 17 00:00:00 2001 From: dogvane Date: Sun, 8 Oct 2023 22:06:15 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3keras=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=B8=8B=EF=BC=8C=E4=BD=BF=E7=94=A8GPU=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E6=97=B6=E4=BC=9A=E7=88=86=E6=98=BE=E5=AD=98=E7=9A=84bug?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 观察到的现象是,一些模型增大batchsize后,会在首个epoch的中途爆显存不足,只要过了一个epoch后,就能完整训练。同样的batchsize在python下能设置大得多的值。 最后使用最小训练代码分析出,是每个step之后,图片加载到显存里的数据没有释放导致的。 在寻找释放显存接口没有结果的时候,直接使用了GC.Collect();可以让显存主动回收。 因此当前的修复方案是在每个step里,都执行一次 GC.Collect(); 用来释放显存资源。 --- src/TensorFlowNET.Core/Keras/Engine/IModel.cs | 23 +++++++++++++++++++ .../Engine/Model.Evaluate.cs | 3 +++ src/TensorFlowNET.Keras/Engine/Model.Fit.cs | 12 +++++----- .../Engine/Model.Predict.cs | 2 +- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/TensorFlowNET.Core/Keras/Engine/IModel.cs b/src/TensorFlowNET.Core/Keras/Engine/IModel.cs index 1840f88b..889c76d9 100644 --- a/src/TensorFlowNET.Core/Keras/Engine/IModel.cs +++ b/src/TensorFlowNET.Core/Keras/Engine/IModel.cs @@ -24,6 +24,7 @@ public interface IModel : ILayer List callbacks = null, float validation_split = 0f, ValidationDataPack validation_data = null, + int validation_step = 10, bool shuffle = true, Dictionary class_weight = null, NDArray sample_weight = null, @@ -47,6 +48,20 @@ public interface IModel : ILayer int workers = 1, bool use_multiprocessing = false); + public ICallback fit(IDatasetV2 dataset, + int batch_size = -1, + int epochs = 1, + int verbose = 1, + List callbacks = null, + IDatasetV2 validation_data = null, + int validation_step = 10, // 间隔多少次会进行一次验证 + bool shuffle = true, + Dictionary class_weight = null, + int initial_epoch = 0, + int max_queue_size = 10, + int workers = 1, + bool use_multiprocessing = false); + void save(string filepath, bool overwrite = true, bool include_optimizer = true, @@ -85,6 +100,14 @@ public interface IModel : ILayer int workers = 1, bool use_multiprocessing = false); + public Tensors predict(IDatasetV2 dataset, + int batch_size = -1, + int verbose = 0, + int steps = -1, + int max_queue_size = 10, + int workers = 1, + bool use_multiprocessing = false); + void summary(int line_length = -1, float[] positions = null); IKerasConfig get_config(); diff --git a/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs b/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs index 94a2e664..474d5e5a 100644 --- a/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs +++ b/src/TensorFlowNET.Keras/Engine/Model.Evaluate.cs @@ -132,6 +132,7 @@ namespace Tensorflow.Keras.Engine var end_step = step + data_handler.StepIncrement; if (!is_val) callbacks.on_test_batch_end(end_step, logs); + GC.Collect(); } } callbacks.on_test_end(logs); @@ -167,7 +168,9 @@ namespace Tensorflow.Keras.Engine Dictionary test_step(DataHandler data_handler, Tensors x, Tensors y) { (x,y) = data_handler.DataAdapter.Expand1d(x, y); + var y_pred = Apply(x, training: false); + var loss = compiled_loss.Call(y, y_pred); compiled_metrics.update_state(y, y_pred); return metrics.Select(x => (x.Name, x.result())).ToDictionary(x => x.Item1, x => (float)x.Item2); diff --git a/src/TensorFlowNET.Keras/Engine/Model.Fit.cs b/src/TensorFlowNET.Keras/Engine/Model.Fit.cs index 689fc9fb..d61211c7 100644 --- a/src/TensorFlowNET.Keras/Engine/Model.Fit.cs +++ b/src/TensorFlowNET.Keras/Engine/Model.Fit.cs @@ -41,6 +41,7 @@ namespace Tensorflow.Keras.Engine List callbacks = null, float validation_split = 0f, ValidationDataPack validation_data = null, + int validation_step = 10, bool shuffle = true, Dictionary class_weight = null, NDArray sample_weight = null, @@ -147,7 +148,7 @@ namespace Tensorflow.Keras.Engine } } - public History fit(IDatasetV2 dataset, + public ICallback fit(IDatasetV2 dataset, int batch_size = -1, int epochs = 1, int verbose = 1, @@ -156,7 +157,6 @@ namespace Tensorflow.Keras.Engine int validation_step = 10, bool shuffle = true, Dictionary class_weight = null, - NDArray sample_weight = null, int initial_epoch = 0, int max_queue_size = 10, int workers = 1, @@ -170,7 +170,7 @@ namespace Tensorflow.Keras.Engine InitialEpoch = initial_epoch, Epochs = epochs, Shuffle = shuffle, - SampleWeight = sample_weight, + ClassWeight = class_weight, MaxQueueSize = max_queue_size, Workers = workers, UseMultiprocessing = use_multiprocessing, @@ -218,6 +218,7 @@ namespace Tensorflow.Keras.Engine var end_step = step + data_handler.StepIncrement; End_step = end_step; callbacks.on_train_batch_end(end_step, logs); + GC.Collect(); } if (validation_data != null) @@ -233,11 +234,10 @@ namespace Tensorflow.Keras.Engine callbacks.on_train_batch_end(End_step, logs); } + GC.Collect(); callbacks.on_epoch_end(epoch, logs); - GC.Collect(); - GC.WaitForPendingFinalizers(); if (stop_training) { break; @@ -282,6 +282,7 @@ namespace Tensorflow.Keras.Engine var end_step = step + data_handler.StepIncrement; End_step = end_step; callbacks.on_train_batch_end(end_step, logs); + GC.Collect(); } if (validation_data != null) @@ -301,7 +302,6 @@ namespace Tensorflow.Keras.Engine callbacks.on_epoch_end(epoch, logs); GC.Collect(); - GC.WaitForPendingFinalizers(); if (stop_training) { break; diff --git a/src/TensorFlowNET.Keras/Engine/Model.Predict.cs b/src/TensorFlowNET.Keras/Engine/Model.Predict.cs index cbe4a729..e3a5aba6 100644 --- a/src/TensorFlowNET.Keras/Engine/Model.Predict.cs +++ b/src/TensorFlowNET.Keras/Engine/Model.Predict.cs @@ -102,9 +102,9 @@ namespace Tensorflow.Keras.Engine for (int i = 0; i < batch_outputs.Length; i++) batch_outputs[i] = tf.concat(new Tensor[] { batch_outputs[i], tmp_batch_outputs[i] }, axis: 0); } - var end_step = step + data_handler.StepIncrement; callbacks.on_predict_batch_end(end_step, new Dictionary { { "outputs", batch_outputs } }); + GC.Collect(); } }