From 86618e49c9289ea41fd82baefa2f51484f53f0bc Mon Sep 17 00:00:00 2001
From: Oceania2018
Date: Sat, 23 May 2020 06:47:43 -0500
Subject: [PATCH] SGD works.

---
 src/TensorFlowNET.Core/Eager/c_api.eager.cs   |   2 +-
 .../Keras/Optimizers/DeviceDType.cs           |  25 ++++
 .../Keras/Optimizers/OptimizerV2.cs           | 110 +++++++++++++-----
 .../Keras/Optimizers/SGD.cs                   |  25 ++++
 .../Operations/gen_math_ops.cs                |  13 +++
 .../Operations/gen_resource_variable_ops.cs   |  26 +++++
 .../Tensors/Tensor.Value.cs                   |   3 +
 src/TensorFlowNET.Core/Tensors/constant_op.cs |   3 +
 .../Training/gen_training_ops.py.cs           |  33 ++++++
 .../Variables/ResourceVariable.Functions.cs   |  12 ++
 src/TensorFlowNET.Core/tensorflow.cs          |  18 ++-
 11 files changed, 238 insertions(+), 32 deletions(-)
 create mode 100644 src/TensorFlowNET.Core/Keras/Optimizers/DeviceDType.cs

diff --git a/src/TensorFlowNET.Core/Eager/c_api.eager.cs b/src/TensorFlowNET.Core/Eager/c_api.eager.cs
index 148790c0..8946808c 100644
--- a/src/TensorFlowNET.Core/Eager/c_api.eager.cs
+++ b/src/TensorFlowNET.Core/Eager/c_api.eager.cs
@@ -12,7 +12,7 @@ namespace Tensorflow
     [UnmanagedFunctionPointer(CallingConvention.StdCall)]
     public delegate IntPtr _gradient_function_callback(string op_name,
-        BindingArray op_inputs,
+        IntPtr op_inputs,
         BindingArray op_outputs,
         int num_attrs,
         BindingArray output_grads,
diff --git a/src/TensorFlowNET.Core/Keras/Optimizers/DeviceDType.cs b/src/TensorFlowNET.Core/Keras/Optimizers/DeviceDType.cs
new file mode 100644
index 00000000..d3aa5590
--- /dev/null
+++ b/src/TensorFlowNET.Core/Keras/Optimizers/DeviceDType.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Tensorflow.Keras.Optimizers
+{
+    public class DeviceDType : IEqualityComparer<DeviceDType>
+    {
+        public string Device { get; set; }
+        public TF_DataType DType { get; set; }
+
+        public bool Equals(DeviceDType x, DeviceDType y)
+        {
+            return x.ToString() == y.ToString();
+        }
+
+        public int GetHashCode(DeviceDType obj)
+        {
+            return 0;
+        }
+
+        public override string ToString()
+            => $"{Device}, {DType}";
+    }
+}
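DeviceDType doubles as both element type and comparer: `_prepare` in the OptimizerV2.cs hunk below projects each variable to a (device, dtype) key and runs `Distinct(new DeviceDType())`, so per-group constants such as the decayed learning rate are built once per placement rather than once per variable. Returning a constant from `GetHashCode` is legal (it merely forces `Equals` for every candidate pair) and is cheap for the handful of placements an optimizer sees. A minimal, self-contained sketch of the same comparer-plus-`Distinct` pattern; `Placement` is an illustrative stand-in, not a type from this patch:

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

// Composite key that is its own IEqualityComparer, like DeviceDType above.
public class Placement : IEqualityComparer<Placement>
{
    public string Device { get; set; }
    public string DType { get; set; }

    public bool Equals(Placement x, Placement y) => x.ToString() == y.ToString();

    // Constant hash: every pair falls through to Equals; fine for tiny key sets.
    public int GetHashCode(Placement obj) => 0;

    public override string ToString() => $"{Device}, {DType}";
}

public static class PlacementDemo
{
    public static void Main()
    {
        var keys = new[]
        {
            new Placement { Device = "/cpu:0", DType = "float32" },
            new Placement { Device = "/cpu:0", DType = "float32" },
            new Placement { Device = "/gpu:0", DType = "float32" },
        }.Distinct(new Placement()).ToArray();

        Console.WriteLine(keys.Length); // 2: duplicates collapse to one key per placement
    }
}
```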
diff --git a/src/TensorFlowNET.Core/Keras/Optimizers/OptimizerV2.cs b/src/TensorFlowNET.Core/Keras/Optimizers/OptimizerV2.cs
index 1beae7cd..e2c4808d 100644
--- a/src/TensorFlowNET.Core/Keras/Optimizers/OptimizerV2.cs
+++ b/src/TensorFlowNET.Core/Keras/Optimizers/OptimizerV2.cs
@@ -5,6 +5,8 @@ using System.Text;
 using Tensorflow.Keras.Utils;
 using Tensorflow.Train;
 using static Tensorflow.Binding;
+using Tensorflow;
+using Tensorflow.Eager;
 
 namespace Tensorflow.Keras.Optimizers
 {
@@ -17,18 +19,32 @@ namespace Tensorflow.Keras.Optimizers
         protected virtual string _name { get; }
 
         ResourceVariable _iterations;
-        List<ResourceVariable> _weight = new List<ResourceVariable>();
-        Dictionary<string, float> _hyper = new Dictionary<string, float>();
-        Dictionary<string, ResourceVariable> _hyper_variables = new Dictionary<string, ResourceVariable>();
+        List<ResourceVariable> _weight;
+        Dictionary<string, float> _hyper;
+        Dictionary<string, ResourceVariable> _hyper_variables;
         protected bool _momentum;
         protected float _initial_decay = 0.0f;
+        protected bool _use_locking = true;
+
+        Dictionary<DeviceDType, Dictionary<string, Tensor>> apply_state;
 
         public OptimizerV2() : base()
         {
-
+            _weight = new List<ResourceVariable>();
+            _hyper = new Dictionary<string, float>();
+            _hyper_variables = new Dictionary<string, ResourceVariable>();
+            apply_state = new Dictionary<DeviceDType, Dictionary<string, Tensor>>();
         }
 
-        public void apply_gradients(IEnumerable<(Tensor, ResourceVariable)> grads_and_vars)
+        /// <summary>
+        /// Apply gradients to variables.
+        /// </summary>
+        /// <param name="grads_and_vars"></param>
+        /// <param name="name"></param>
+        /// <param name="experimental_aggregate_gradients"></param>
+        public void apply_gradients(IEnumerable<(Tensor, ResourceVariable)> grads_and_vars,
+            string name = null,
+            bool experimental_aggregate_gradients = true)
         {
             var var_list = grads_and_vars.Select(x => x.Item2).ToArray();
             tf_with(ops.name_scope(_name), delegate
@@ -38,49 +54,91 @@ namespace Tensorflow.Keras.Optimizers
                 if (grads_and_vars == null || grads_and_vars.Count() == 0)
                     return control_flow_ops.no_op();
 
-                //var apply_state =
-                _prepare(var_list);
-
-                _aggregate_gradients(grads_and_vars);
+                apply_state = _prepare(var_list);
+                if(experimental_aggregate_gradients)
+                {
+                    // var reduced_grads = _aggregate_gradients(grads_and_vars);
+                    _distributed_apply(grads_and_vars, name, apply_state);
+                }
 
                 return null;
             });
         }
 
-        void _aggregate_gradients(IEnumerable<(Tensor, ResourceVariable)> grads_and_vars)
+        void apply_grad_to_update_var(ResourceVariable var, EagerTensor grad)
+        {
+            _resource_apply_dense(var, grad, apply_state);
+        }
+
+        protected virtual Operation _resource_apply_dense(ResourceVariable var,
+            EagerTensor grad,
+            Dictionary<DeviceDType, Dictionary<string, Tensor>> _apply_state)
+        {
+            throw new NotImplementedException("_resource_apply_dense");
+        }
+
+        void _distributed_apply(IEnumerable<(Tensor, ResourceVariable)> grads_and_vars,
+            string name,
+            Dictionary<DeviceDType, Dictionary<string, Tensor>> _apply_state)
+        {
+            tf_with(ops.name_scope(name, "", new { skip_on_eager = true }), delegate
+            {
+                foreach(var (grad, var) in grads_and_vars)
+                {
+                    tf_with(ops.name_scope("update"), delegate
+                    {
+                        apply_grad_to_update_var(var, grad as EagerTensor);
+                    });
+                }
+
+                _iterations.assign_add(ops.convert_to_tensor(1, dtype: _iterations.dtype));
+            });
+        }
+
+        Tensor[] _aggregate_gradients(IEnumerable<(Tensor, ResourceVariable)> grads_and_vars)
+        {
+            return grads_and_vars.Select(x => x.Item1).ToArray();
+        }
+
+        Dictionary<DeviceDType, Dictionary<string, Tensor>> _prepare(ResourceVariable[] var_list)
         {
-            var lr_t = _hyper_variables["learning_rate"];
-            foreach (var grad_and_var in grads_and_vars)
+            var _apply_state = new Dictionary<DeviceDType, Dictionary<string, Tensor>>();
+            var keys = var_list.Select(x => new DeviceDType
             {
-                var grad = grad_and_var.Item1;
-                var variable = grad_and_var.Item2;
-                // variable.Handle - grad * lr_t.Handle;
+                Device = x.Device,
+                DType = x.dtype.as_base_dtype()
+            }).Distinct(new DeviceDType()).ToArray();
+
+            foreach(var device_dtype in keys)
+            {
+                _apply_state[device_dtype] = new Dictionary<string, Tensor>();
+                _prepare_local(device_dtype, _apply_state);
             }
+
+            return _apply_state;
         }
 
-        void _prepare(ResourceVariable[] var_list)
+        protected virtual void _prepare_local(DeviceDType device_dtype,
+            Dictionary<DeviceDType, Dictionary<string, Tensor>> _apply_state)
         {
-            var keys = new HashSet<(string, TF_DataType)>();
-            foreach(var variable in var_list)
+            if (_hyper.ContainsKey("learning_rate"))
             {
-                var lr_t = _prepare_local(variable.Device, variable.dtype.as_base_dtype());
-                var momentum = _get_hyper("momentum", variable.dtype);
-                array_ops.identity(momentum);
+                var lr_t = array_ops.identity(_decayed_lr(device_dtype.DType));
+                _apply_state[device_dtype]["lr_t"] = lr_t;
             }
         }
 
-        ResourceVariable _prepare_local(string var_device, TF_DataType var_dtype)
+        Tensor _decayed_lr(TF_DataType var_dtype)
        {
             var lr_t = _get_hyper("learning_rate", var_dtype);
-            if(_initial_decay > 0)
+            if(_initial_decay > 0.0f)
             {
-
+                throw new NotImplementedException("");
             }
-
             return lr_t;
         }
 
-        ResourceVariable _get_hyper(string name, TF_DataType dtype = TF_DataType.DtInvalid)
+        protected ResourceVariable _get_hyper(string name, TF_DataType dtype = TF_DataType.DtInvalid)
         {
             var value = _hyper_variables[name];
             return math_ops.cast(value, dtype);
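`apply_gradients` now follows the template-method shape of Keras' Python OptimizerV2: `_prepare` materializes per-(device, dtype) constants into `apply_state`, `_distributed_apply` walks the (grad, var) pairs under an "update" name scope and hands each one to the virtual `_resource_apply_dense`, and `_iterations.assign_add(1)` bumps the step counter. A framework-free sketch of that control flow; every type below is a stand-in (plain float arrays instead of tensors), not the TensorFlow.NET API:

```csharp
using System;
using System.Collections.Generic;

abstract class OptimizerSketch
{
    readonly Dictionary<string, float> applyState = new Dictionary<string, float>();
    long iterations;

    // Mirrors apply_gradients: build shared state once, update each variable, bump the step.
    public void Apply(IEnumerable<(float[] grad, float[] variable)> gradsAndVars)
    {
        PrepareLocal(applyState);                   // e.g. applyState["lr_t"] = decayed lr
        foreach (var (grad, variable) in gradsAndVars)
            ApplyDense(variable, grad, applyState); // virtual per-variable kernel
        iterations++;                               // mirrors _iterations.assign_add(1)
    }

    protected abstract void PrepareLocal(Dictionary<string, float> state);
    protected abstract void ApplyDense(float[] variable, float[] grad,
        Dictionary<string, float> state);
}

class SgdSketch : OptimizerSketch
{
    readonly float learningRate;
    public SgdSketch(float learningRate) => this.learningRate = learningRate;

    protected override void PrepareLocal(Dictionary<string, float> state)
        => state["lr_t"] = learningRate; // no decay, matching _decayed_lr's fast path

    // The ResourceApplyGradientDescent update: variable[i] -= lr_t * grad[i].
    protected override void ApplyDense(float[] variable, float[] grad,
        Dictionary<string, float> state)
    {
        for (int i = 0; i < variable.Length; i++)
            variable[i] -= state["lr_t"] * grad[i];
    }
}
```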
diff --git a/src/TensorFlowNET.Core/Keras/Optimizers/SGD.cs b/src/TensorFlowNET.Core/Keras/Optimizers/SGD.cs
index 975854a6..03be366e 100644
--- a/src/TensorFlowNET.Core/Keras/Optimizers/SGD.cs
+++ b/src/TensorFlowNET.Core/Keras/Optimizers/SGD.cs
@@ -1,6 +1,8 @@
 using System;
 using System.Collections.Generic;
+using System.Linq;
 using System.Text;
+using Tensorflow.Eager;
 
 namespace Tensorflow.Keras.Optimizers
 {
@@ -24,5 +26,28 @@ namespace Tensorflow.Keras.Optimizers
 
             nesterov = nesterov;
         }
+
+        protected override void _prepare_local(DeviceDType device_dtype,
+            Dictionary<DeviceDType, Dictionary<string, Tensor>> _apply_state)
+        {
+            base._prepare_local(device_dtype, _apply_state);
+
+            _apply_state[device_dtype]["momentum"] = array_ops.identity(
+                _get_hyper("momentum", device_dtype.DType));
+        }
+
+        protected override Operation _resource_apply_dense(ResourceVariable var, EagerTensor grad, Dictionary<DeviceDType, Dictionary<string, Tensor>> _apply_state)
+        {
+            if (_momentum)
+            {
+                throw new NotImplementedException("_resource_apply_dense");
+            }
+            var device_dtype = _apply_state.Keys.FirstOrDefault(x => x.Device == var.Device && x.DType == var.dtype.as_base_dtype());
+
+            return gen_training_ops.resource_apply_gradient_descent(var.Handle as EagerTensor,
+                _apply_state[device_dtype]["lr_t"] as EagerTensor,
+                grad,
+                use_locking: _use_locking);
+        }
     }
 }
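The `_momentum` branch still throws, so this commit covers only vanilla SGD: `_resource_apply_dense` resolves the cached `lr_t` by the variable's (device, dtype) key and dispatches straight to the `ResourceApplyGradientDescent` kernel, i.e. `var <- var - lr_t * grad`. For reference, the classical momentum rule the branch would eventually need is sketched below; this is the textbook Keras-style formulation (velocity update plus optional Nesterov look-ahead), not code from this patch:

```csharp
static class MomentumSketch
{
    // Classical momentum SGD:
    //   v   <- momentum * v - lr * grad
    //   var <- var + v            (Nesterov: var <- var + momentum * v - lr * grad)
    public static void MomentumStep(float[] variable, float[] grad, float[] velocity,
                                    float lr, float momentum, bool nesterov)
    {
        for (int i = 0; i < variable.Length; i++)
        {
            velocity[i] = momentum * velocity[i] - lr * grad[i];
            variable[i] += nesterov
                ? momentum * velocity[i] - lr * grad[i]
                : velocity[i];
        }
    }
}
```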
diff --git a/src/TensorFlowNET.Core/Operations/gen_math_ops.cs b/src/TensorFlowNET.Core/Operations/gen_math_ops.cs
index 9d2f556c..2a37d290 100644
--- a/src/TensorFlowNET.Core/Operations/gen_math_ops.cs
+++ b/src/TensorFlowNET.Core/Operations/gen_math_ops.cs
@@ -894,6 +894,19 @@ namespace Tensorflow
 
         public static Tensor floor_mod(Tensor x, Tensor y, string name = null)
         {
+            if (tf.context.executing_eagerly())
+            {
+                using var status = new Status();
+                EagerTensorHandle tensor = c_api.TFE_FastPathExecute(tf.context, tf.context.device_name,
+                    "FloorMod", name, new IntPtr[]
+                    {
+                        x as EagerTensor,
+                        y as EagerTensor
+                    }, 2, null, status);
+                status.Check(true);
+                return tensor;
+            }
+
             var _op = _op_def_lib._apply_op_helper("FloorMod", name, args: new { x, y });
 
             return _op.outputs[0];
diff --git a/src/TensorFlowNET.Core/Operations/gen_resource_variable_ops.cs b/src/TensorFlowNET.Core/Operations/gen_resource_variable_ops.cs
index b7b9fcd2..9a224e5f 100644
--- a/src/TensorFlowNET.Core/Operations/gen_resource_variable_ops.cs
+++ b/src/TensorFlowNET.Core/Operations/gen_resource_variable_ops.cs
@@ -44,6 +44,32 @@ namespace Tensorflow
             return null;
         }
 
+        /// <summary>
+        /// Adds a value to the current value of a variable.
+        /// </summary>
+        /// <param name="resource"></param>
+        /// <param name="value"></param>
+        /// <param name="name"></param>
+        /// <returns></returns>
+        public static Operation assign_add_variable_op(Tensor resource, Tensor value, string name = null)
+        {
+            if (tf.context.executing_eagerly())
+            {
+                using var status = new Status();
+                var tensor = c_api.TFE_FastPathExecute(tf.context, tf.context.device_name,
+                    "AssignAddVariableOp", name,
+                    new IntPtr[]
+                    {
+                        resource as EagerTensor,
+                        value as EagerTensor
+                    }, 2, null, status);
+                status.Check(true);
+                return tensor;
+            }
+
+            return null;
+        }
+
         public static Operation assign_variable_op(Tensor resource, Tensor value, string name = null)
         {
             if (tf.context.executing_eagerly())
diff --git a/src/TensorFlowNET.Core/Tensors/Tensor.Value.cs b/src/TensorFlowNET.Core/Tensors/Tensor.Value.cs
index 3fdb3bb9..440fd086 100644
--- a/src/TensorFlowNET.Core/Tensors/Tensor.Value.cs
+++ b/src/TensorFlowNET.Core/Tensors/Tensor.Value.cs
@@ -163,6 +163,9 @@ namespace Tensorflow
                 case TF_DataType.TF_INT32:
                     storage = new UnmanagedStorage(NPTypeCode.Int32);
                     break;
+                case TF_DataType.TF_INT64:
+                    storage = new UnmanagedStorage(NPTypeCode.Int64);
+                    break;
                 case TF_DataType.TF_FLOAT:
                     storage = new UnmanagedStorage(NPTypeCode.Float);
                     break;
diff --git a/src/TensorFlowNET.Core/Tensors/constant_op.cs b/src/TensorFlowNET.Core/Tensors/constant_op.cs
index 6c684dc5..c8ad5fb0 100644
--- a/src/TensorFlowNET.Core/Tensors/constant_op.cs
+++ b/src/TensorFlowNET.Core/Tensors/constant_op.cs
@@ -124,6 +124,9 @@ namespace Tensorflow
                 case TF_DataType.TF_FLOAT:
                     value = Convert.ToSingle(value);
                     break;
+                case TF_DataType.TF_INT64:
+                    value = Convert.ToInt64(value);
+                    break;
                 default:
                     break;
             }
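The two `TF_INT64` cases exist for the same reason as `assign_add_variable_op`: every `apply_gradients` call ends in `_iterations.assign_add(1)`, which builds an int64 constant and routes it through the `AssignAddVariableOp` kernel, a read-modify-write against the variable's resource handle. A toy model of the resource-variable semantics involved; `ToyResourceVariable` is illustrative only, not TensorFlow.NET's class:

```csharp
using System;

// The three ops over a resource handle, collapsed onto one managed field.
class ToyResourceVariable
{
    long value;
    public ToyResourceVariable(long initial) => value = initial;

    public void AssignAdd(long delta) => value += delta; // AssignAddVariableOp
    public void Assign(long v) => value = v;             // AssignVariableOp
    public long Read() => value;                         // ReadVariableOp
}

static class StepCounterDemo
{
    static void Main()
    {
        var iterations = new ToyResourceVariable(0);
        iterations.AssignAdd(1); // what each apply_gradients call performs
        Console.WriteLine(iterations.Read()); // 1
    }
}
```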
diff --git a/src/TensorFlowNET.Core/Training/gen_training_ops.py.cs b/src/TensorFlowNET.Core/Training/gen_training_ops.py.cs
index 7235ce7b..dc162865 100644
--- a/src/TensorFlowNET.Core/Training/gen_training_ops.py.cs
+++ b/src/TensorFlowNET.Core/Training/gen_training_ops.py.cs
@@ -14,6 +14,10 @@ limitations under the License.
 ******************************************************************************/
 
+using System;
+using Tensorflow.Eager;
+using static Tensorflow.Binding;
+
 namespace Tensorflow
 {
     public class gen_training_ops
@@ -55,5 +59,34 @@ namespace Tensorflow
 
             return _op.outputs[0];
         }
+
+        public static Operation resource_apply_gradient_descent(EagerTensor var, EagerTensor alpha, EagerTensor delta, bool use_locking = false, string name = null)
+        {
+            if (tf.context.executing_eagerly())
+            {
+                using var status = new Status();
+                var tensor = c_api.TFE_FastPathExecute(tf.context, tf.context.device_name,
+                    "ResourceApplyGradientDescent", name, new IntPtr[]
+                    {
+                        var,
+                        alpha,
+                        delta
+                    }, 3,
+                    op => wrap_tfe_src.SetOpAttrs(op, "use_locking", use_locking),
+                    status);
+                status.Check(true);
+                return tensor;
+            }
+
+            var _op = _op_def_lib._apply_op_helper("ResourceApplyGradientDescent", name, new
+            {
+                var,
+                alpha,
+                delta,
+                use_locking
+            });
+
+            return _op.outputs[0];
+        }
     }
 }
diff --git a/src/TensorFlowNET.Core/Variables/ResourceVariable.Functions.cs b/src/TensorFlowNET.Core/Variables/ResourceVariable.Functions.cs
index 7b5e3232..1978d60a 100644
--- a/src/TensorFlowNET.Core/Variables/ResourceVariable.Functions.cs
+++ b/src/TensorFlowNET.Core/Variables/ResourceVariable.Functions.cs
@@ -33,5 +33,17 @@ namespace Tensorflow
         {
             gen_resource_variable_ops.assign_sub_variable_op(handle, delta, name: name);
         }
+
+        /// <summary>
+        /// Adds a value to this variable.
+        /// </summary>
+        /// <param name="delta"></param>
+        /// <param name="use_locking"></param>
+        /// <param name="name"></param>
+        /// <param name="read_value"></param>
+        public void assign_add(Tensor delta, bool use_locking = false, string name = null, bool read_value = true)
+        {
+            gen_resource_variable_ops.assign_add_variable_op(handle, delta, name: name);
+        }
     }
 }
diff --git a/src/TensorFlowNET.Core/tensorflow.cs b/src/TensorFlowNET.Core/tensorflow.cs
index 732ab264..de2fe450 100644
--- a/src/TensorFlowNET.Core/tensorflow.cs
+++ b/src/TensorFlowNET.Core/tensorflow.cs
@@ -57,21 +57,28 @@ namespace Tensorflow
                 for (int i = 0; i < num_grads; i++)
                     input_grads[i] = new EagerTensor(*((IntPtr*)gradients + i));
 
-                var add_n = gen_math_ops.add_n(input_grads);
-                return (add_n as EagerTensor).EagerTensorHandle;
+                var add_n = gen_math_ops.add_n(input_grads) as EagerTensor;
+                return add_n.EagerTensorHandle;
             });
 
             ops.RegisterFromAssembly();
 
-            c_api.TFE_RegisterGradientFunction((op_name, op_inputs, op_outputs, num_attrs, output_grads, skip_input_indices) =>
+            c_api.TFE_RegisterGradientFunction((op_name, op_inputs_handle, op_outputs, num_attrs, output_grads, skip_input_indices) =>
             {
+                var op_inputs = Marshal.PtrToStructure<BindingArray>(op_inputs_handle);
                 var input_tensors = new EagerTensor[op_inputs.length];
                 for (int i = 0; i < op_inputs.length; i++)
+                {
+                    // Console.WriteLine($"debug 4: {op_name} op_inputs=" + (*(IntPtr*)op_inputs_handle).ToString("x16").ToUpper() + $" op_inputs[{i}]=" + (*((IntPtr*)op_inputs.array + i)).ToString("x16").ToUpper());
+                    if((*((IntPtr*)op_inputs.array + i)).ToString("x16").ToUpper().StartsWith("FFFFF"))
+                    {
+
+                    }
                     input_tensors[i] = new EagerTensor(*((IntPtr*)op_inputs.array + i));
+                }
 
                 var output_tensors = new EagerTensor[op_outputs.length];
                 for (int i = 0; i < op_outputs.length; i++)
-                    if (op_outputs.array != IntPtr.Zero)
-                        output_tensors[i] = new EagerTensor(*((IntPtr*)op_outputs.array + i));
+                    output_tensors[i] = new EagerTensor(*((IntPtr*)op_outputs.array + i));
 
                 var output_grad_tensors = new EagerTensor[output_grads.length];
                 for (int i = 0; i < output_grads.length; i++)
@@ -85,6 +92,7 @@
                 {
                     NumInputs = input_tensors.Length,
                     Inputs = input_tensors,
+                    NumOutputs = output_tensors.Length,
                     Outputs = output_tensors,
                     SkipInputIndices = skip_input_indices_param
                 }, output_grad_tensors);
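The callback change pairs with the `c_api.eager.cs` hunk at the top of the patch: the delegate now receives `op_inputs` as a raw `IntPtr`, so the managed side rebuilds the array struct itself with `Marshal.PtrToStructure<BindingArray>` before walking the handles, and `NumOutputs` is filled in alongside `Outputs` so the op record handed to the gradient function is complete. A minimal sketch of that marshaling pattern; the `NativeArray` layout mirrors how `BindingArray` is used here (`array` pointer plus `length`) and is an assumption, not the actual declaration:

```csharp
using System;
using System.Runtime.InteropServices;

[StructLayout(LayoutKind.Sequential)]
struct NativeArray
{
    public IntPtr array;  // contiguous block of IntPtr-sized tensor handles
    public int length;
}

static class MarshalDemo
{
    // Rebuild the struct from a raw pointer, then read each handle (no unsafe code needed).
    static IntPtr[] ReadHandles(IntPtr nativeArrayPtr)
    {
        var arr = Marshal.PtrToStructure<NativeArray>(nativeArrayPtr);
        var handles = new IntPtr[arr.length];
        for (int i = 0; i < arr.length; i++)
            handles[i] = Marshal.ReadIntPtr(arr.array, i * IntPtr.Size);
        return handles;
    }

    static void Main()
    {
        // Simulate the native side: two fake handles behind a NativeArray*.
        var block = Marshal.AllocHGlobal(IntPtr.Size * 2);
        Marshal.WriteIntPtr(block, 0, new IntPtr(0x1111));
        Marshal.WriteIntPtr(block, IntPtr.Size, new IntPtr(0x2222));

        var structPtr = Marshal.AllocHGlobal(Marshal.SizeOf<NativeArray>());
        Marshal.StructureToPtr(new NativeArray { array = block, length = 2 }, structPtr, false);

        foreach (var handle in ReadHandles(structPtr))
            Console.WriteLine($"0x{handle.ToInt64():x}");

        Marshal.FreeHGlobal(structPtr);
        Marshal.FreeHGlobal(block);
    }
}
```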