|
- #pragma once
-
- #include <algorithm>
- #include <cmath>
- #include <cstddef>
- #include <iostream>
- #include <memory>
- #include <stdexcept>
- #include <unordered_map>
- #include <unordered_set>
- #include <utility>
- #include <vector>
-
- #include <pybind11/pybind11.h>
- #include <pybind11/numpy.h>
-
- #include "../tensor/tensor.h"
- #include "../math/arith.h"
-
- namespace py = pybind11;
-
- namespace nn {
-
- // Abstract base of every vertex in the computation graph.
- //
- // A node stores the tensor it evaluated to (`data`) and the nodes it was
- // built from (`objects`). The `gradient` member is declared here but is not
- // touched by any method of this class.
- class Node {
- public:
-     std::shared_ptr<tensor::Tensor> data;
-     std::vector<std::shared_ptr<Node>> objects;
-     std::vector<std::shared_ptr<tensor::Tensor>> gradient;
-
-     Node() = default;
-
-     // Evaluates the node and returns the resulting tensor.
-     virtual std::shared_ptr<tensor::Tensor> forward() = 0;
-
-     // Turns the gradient arriving at this node into the gradients handed to
-     // its inputs. The subclasses in this file return one tensor per entry of
-     // `objects`; data nodes return the incoming gradient itself.
-     virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
-
-     // Returns a copy of the list of input nodes.
-     std::vector<std::shared_ptr<Node>> get_parents() {
-         return objects;
-     }
-
-     // Returns a copy of the flat value buffer; `data` must not be null.
-     std::vector<float> get_data() {
-         const auto& held = *data;
-         return held.data;
-     }
-
-     // Returns the shared handle to the stored tensor (may be null).
-     std::shared_ptr<tensor::Tensor> get_tensor() {
-         return data;
-     }
-
-     // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
-     // virtual void zero_grad() = 0;
-     virtual ~Node() = default;
- };
-
- // Common base of the leaf nodes (Parameter, Constant). It adds no members
- // and leaves forward()/backward() abstract.
- class DataNode : public Node {
- public:
-     DataNode() = default;
- }; // class DataNode
-
- // Trainable leaf node whose tensor is copied from a NumPy float array.
- class Parameter : public DataNode {
- public:
-     // Parameter(const std::vector<std::size_t>& shape) {
-     //     this->data = std::make_shared<tensor::Tensor>(shape, true);
-     // }
-
-     // Copies `array` into a newly allocated tensor with the same shape.
-     //
-     // The input is first converted to a C-contiguous float array. Reading
-     // `size` floats straight from the raw buffer pointer is only correct for
-     // contiguous storage; a strided view (e.g. a transposed array) would
-     // otherwise be copied in the wrong element order.
-     //
-     // Throws std::invalid_argument if the array cannot be converted.
-     Parameter(py::array_t<float> array) {
-         const auto contiguous =
-             py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(array);
-         if (!contiguous) {
-             throw std::invalid_argument("Parameter: array cannot be converted to a contiguous float array");
-         }
-         const py::buffer_info info = contiguous.request();
-         const float* values = static_cast<const float*>(info.ptr);
-
-         std::vector<std::size_t> shape;
-         shape.reserve(info.shape.size());
-         for (const auto extent : info.shape) {
-             shape.push_back(static_cast<std::size_t>(extent));
-         }
-
-         auto tensor = std::make_shared<tensor::Tensor>(shape);
-         tensor->data = std::vector<float>(values, values + info.size);
-         this->data = std::move(tensor);
-     }
-
-     // A parameter evaluates to its stored tensor.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         return this->data;
-     }
-
-     // Leaf node: the incoming gradient is the gradient of the parameter.
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         return {gradient};
-     }
-
-     // In-place gradient step: data[i] -= lr * grad[i] for every element.
-     //
-     // Throws std::invalid_argument if `grad` is null or its element count
-     // differs from the parameter's; the original loop would read out of
-     // bounds in that case.
-     void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
-         const std::size_t count = static_cast<std::size_t>(this->data->size);
-         if (!grad || grad->data.size() != count) {
-             throw std::invalid_argument("Parameter::update: gradient size does not match parameter size");
-         }
-         for (std::size_t i = 0; i < count; ++i) {
-             // The step is computed in double (lr is double) and narrowed once on store.
-             this->data->data[i] = static_cast<float>(this->data->data[i] - lr * grad->data[i]);
-         }
-     }
- }; // class Parameter
-
- // Leaf node wrapping a fixed tensor. It has no update() method, unlike
- // Parameter.
- class Constant : public DataNode {
- public:
-     // Shares ownership of an already constructed tensor.
-     Constant(std::shared_ptr<tensor::Tensor> data) {
-         Node::data = data;
-     }
-
-     // Builds the tensor from a NumPy float array via tensor::pyarray_to_tensor.
-     Constant(py::array_t<float> array) {
-         Node::data = tensor::pyarray_to_tensor(array);
-     }
-
-     // A constant evaluates to its stored tensor.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         return Node::data;
-     }
-
-     // Passes the incoming gradient through unchanged as a one-element list.
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         std::vector<std::shared_ptr<tensor::Tensor>> passthrough;
-         passthrough.push_back(gradient);
-         return passthrough;
-     }
-     // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
- }; // class Constant
-
- // Base for nodes computed from one or two input nodes. The constructors only
- // record the inputs in `objects`; they do not evaluate anything, so `data`
- // stays null until a subclass assigns it. backward() remains pure virtual.
- class FunctionNode : public Node {
- public:
-     // Binary operation: inputs are stored in the order (a, b).
-     FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
-         objects.push_back(a);
-         objects.push_back(b);
-     }
-
-     // Unary operation: `a` is the only input.
-     FunctionNode(std::shared_ptr<Node> a) {
-         objects.push_back(a);
-     }
-
-     // Placeholder implementation returning a null tensor; the concrete
-     // operations in this file override it.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         return std::shared_ptr<tensor::Tensor>{};
-     }
- }; //class FunctionNode
-
- // Element-wise sum of two nodes holding the same number of elements.
- class Add : public FunctionNode {
- public:
-     // Records the operands and evaluates the sum immediately, so `data` is
-     // valid as soon as the node exists.
-     Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
-         this->data = this->forward();
-     }
-
-     // Returns a new tensor with the shape of the first operand where
-     // out[i] = a[i] + b[i].
-     //
-     // Throws std::invalid_argument if an operand has no tensor yet or if the
-     // element counts differ (otherwise the loop would read past the end of
-     // the smaller operand).
-     std::shared_ptr<tensor::Tensor> forward() override {
-         const auto& lhs = this->objects[0]->data;
-         const auto& rhs = this->objects[1]->data;
-         if (!lhs || !rhs) {
-             throw std::invalid_argument("Add: both operands must hold a tensor");
-         }
-         if (lhs->size != rhs->size) {
-             throw std::invalid_argument("Add: operands must have the same number of elements");
-         }
-
-         auto sum = std::make_shared<tensor::Tensor>(lhs->shape);
-         const std::size_t count = static_cast<std::size_t>(lhs->size);
-         for (std::size_t i = 0; i < count; ++i) {
-             sum->data[i] = lhs->data[i] + rhs->data[i];
-         }
-         return sum;
-     }
-
-     // d(a + b)/da = d(a + b)/db = 1, so both operands receive the incoming
-     // gradient unchanged (the same tensor object is shared by both entries).
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         return {gradient, gradient};
-     }
- };
-
- // Adds a bias vector to every row of a row-major feature matrix.
- //   features: (batch_size x num_features)
- //   bias:     (1 x num_features)
- class AddBias : public FunctionNode {
- public:
-     // Records the operands and evaluates the result immediately.
-     AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
-         this->data = this->forward();
-     }
-
-     // Returns a tensor shaped like `features` where
-     // out[i][j] = features[i][j] + bias[j].
-     //
-     // Throws std::invalid_argument if an operand has no tensor, if
-     // `features` has fewer than two dimensions, or if `bias` holds fewer
-     // than num_features values; each case would otherwise index out of
-     // bounds.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         const auto& features = this->objects[0]->data;
-         const auto& bias = this->objects[1]->data;
-         if (!features || !bias) {
-             throw std::invalid_argument("AddBias: both operands must hold a tensor");
-         }
-         if (features->shape.size() < 2) {
-             throw std::invalid_argument("AddBias: features must be a (batch_size x num_features) matrix");
-         }
-         const std::size_t batch_size = features->shape[0];
-         const std::size_t num_features = features->shape[1];
-         if (bias->data.size() < num_features) {
-             throw std::invalid_argument("AddBias: bias must provide one value per feature");
-         }
-
-         auto out = std::make_shared<tensor::Tensor>(features->shape);
-         for (std::size_t i = 0; i < batch_size; ++i) {
-             // Offset of row i in the flat row-major buffer.
-             const std::size_t row = i * num_features;
-             for (std::size_t j = 0; j < num_features; ++j) {
-                 out->data[row + j] = features->data[row + j] + bias->data[j];
-             }
-         }
-         return out;
-     }
-
-     // Returns {grad_features, grad_bias}:
-     //   grad_features is a copy of the incoming gradient;
-     //   grad_bias[j] is the sum of column j of the incoming gradient.
-     //
-     // Throws std::invalid_argument if `gradient` is null or holds fewer than
-     // batch_size * num_features values.
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         const auto& features = this->objects[0]->data;
-         const auto& bias = this->objects[1]->data;
-         const std::size_t batch_size = features->shape[0];
-         const std::size_t num_features = features->shape[1];
-         if (!gradient || gradient->data.size() < batch_size * num_features) {
-             throw std::invalid_argument("AddBias::backward: gradient does not match the features shape");
-         }
-
-         auto grad_features = std::make_shared<tensor::Tensor>(features->shape);
-         grad_features->data = gradient->data;
-
-         auto grad_bias = std::make_shared<tensor::Tensor>(bias->shape);
-         for (std::size_t j = 0; j < num_features; ++j) {
-             float column_sum = 0.0f;
-             for (std::size_t i = 0; i < batch_size; ++i) {
-                 column_sum += gradient->data[i * num_features + j];
-             }
-             grad_bias->data[j] = column_sum;
-         }
-
-         return {grad_features, grad_bias};
-     }
-
-     // Returns a copy of the flat output buffer (same behavior as
-     // Node::get_data, kept so existing references to AddBias::get_data
-     // still resolve).
-     std::vector<float> get_data() {
-         return this->data->data;
-     }
- }; // class AddBias
-
-
- // Matrix product of a feature matrix and a weight matrix (row-major):
- //   features: (batch_size x input_features)      = (m x k)
- //   weights:  (input_features x output_features) = (k x n)
- //   output:   (batch_size x output_features)     = (m x n)
- class Linear : public FunctionNode {
- public:
-     // Records the operands and evaluates the product immediately, like the
-     // other function nodes; without this `data` would stay null and any node
-     // built on top of a Linear would dereference a null tensor.
-     Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
-         this->data = this->forward();
-     }
-
-     // Returns features x weights as a new (m x n) tensor.
-     //
-     // Throws std::invalid_argument if an operand has no tensor or if the
-     // inner dimensions disagree.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         const auto& features = this->objects[0]->data;
-         const auto& weights = this->objects[1]->data;
-         if (!features || !weights) {
-             throw std::invalid_argument("Linear: both operands must hold a tensor");
-         }
-         auto m = features->shape[0];
-         auto k = features->shape[1];
-         auto n = weights->shape[1];
-         if (weights->shape[0] != k) {
-             throw std::invalid_argument("Linear: features columns must equal weights rows");
-         }
-
-         // The braced list becomes the shape argument of the tensor constructor.
-         auto shape = {m, n};
-         auto out = std::make_shared<tensor::Tensor>(shape);
-         // arith::mm(A, B, C, m, k, n) is used as C(m x n) = A(m x k) * B(k x n).
-         arith::mm(features->data, weights->data, out->data, m, k, n);
-         return out;
-     }
-
-     // Returns {grad_features, grad_weights} for an incoming gradient G (m x n):
-     //   grad_features (m x k) = G * weights^T
-     //   grad_weights  (k x n) = features^T * G
-     //
-     // NOTE(review): relies on arith::mm computing a plain row-major product
-     // that overwrites its output argument - confirm against arith.h.
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         const auto& features = this->objects[0]->data;
-         const auto& weights = this->objects[1]->data;
-         auto m = features->shape[0];
-         auto k = features->shape[1];
-         auto n = weights->shape[1];
-
-         // gradient.shape[0] == features.shape[0]
-         // gradient.shape[1] == weights.shape[1]
-         auto grad_features_shape = {gradient->shape[0], weights->shape[0]};
-         auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
-         auto grad_weights_shape = {features->shape[1], gradient->shape[1]};
-         auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
-
-         // (m x n) * (n x k) -> (m x k)
-         std::vector<float> weights_t = transpose(weights->data, k, n);
-         arith::mm(gradient->data, weights_t, grad_features->data, m, n, k);
-
-         // (k x m) * (m x n) -> (k x n)
-         std::vector<float> features_t = transpose(features->data, m, k);
-         arith::mm(features_t, gradient->data, grad_weights->data, k, m, n);
-
-         return {grad_features, grad_weights};
-     }
-
- private:
-     // Returns the (cols x rows) transpose of a row-major (rows x cols) matrix.
-     static std::vector<float> transpose(const std::vector<float>& src, std::size_t rows, std::size_t cols) {
-         std::vector<float> dst(rows * cols);
-         for (std::size_t r = 0; r < rows; ++r) {
-             for (std::size_t c = 0; c < cols; ++c) {
-                 dst[c * rows + r] = src[r * cols + c];
-             }
-         }
-         return dst;
-     }
- }; //class Linear
-
- // Rectified linear unit applied element-wise to a single input node.
- class ReLU : public FunctionNode {
- public:
-     // Records the input and evaluates the activation immediately.
-     ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
-         data = forward();
-     }
-
-     // Returns a tensor shaped like the input, filled by
-     // arith::vector_scalar_max with a scalar of 0.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         const auto& input = objects[0]->data;
-         auto activated = std::make_shared<tensor::Tensor>(input->shape);
-         arith::vector_scalar_max(input->data, activated->data, input->size, 0);
-         return activated;
-     }
-
-     // Lets the incoming gradient through where the input was strictly
-     // positive and writes zero everywhere else.
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         const auto& input = objects[0]->data;
-         auto grads = std::make_shared<tensor::Tensor>(input->shape);
-         for (size_t idx = 0; idx < input->size; ++idx) {
-             if (input->data[idx] > 0) {
-                 grads->data[idx] = gradient->data[idx];
-             } else {
-                 grads->data[idx] = 0;
-             }
-         }
-         return {grads};
-     }
- }; // class ReLU
-
- // Base class of the loss nodes; a loss always takes two inputs.
- class Loss : public FunctionNode {
- public:
-     // Starts out false. Nothing in this header reads or writes it -
-     // presumably consumed by gradients(); verify against its definition.
-     bool used{false};
-
-     // Forwards both inputs to FunctionNode; does not evaluate anything.
-     Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
- };
-
- class SquareLoss: public Loss {
- public:
- SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
- // 补全这里的代码
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() {
- // a: a Node with shape (batch_size x dim)
- // b: a Node with shape (batch_size x dim)
- // 这个简单,就是要注意返回的res需要是一个tensor就行
- // 修改下面的代码
- auto a = this->objects[0];
- auto b = this->objects[1];
- float loss = 0.0f;
-
- // 遍历所有元素,计算均方误差损失
- for (size_t i = 0; i < a->data->size; ++i) {
- float diff = a->data->data[i] - b->data->data[i];
- loss += diff * diff;
- }
-
- // 除以 2 得到最终损失
- loss /= 2.0f;
-
- std::vector<size_t> res_shape = {1};
- auto res = std::make_shared<tensor::Tensor>(res_shape);
- res->data[0] = loss;
- return res;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- float g = gradient->data[0];
- auto a = this->objects[0];
- auto b = this->objects[1];
- auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
- auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
- // 补全下面的代码
- // 计算元素数量
- size_t size = a->data->size;
- // 遍历所有元素,计算梯度
- for (size_t i = 0; i < size; ++i) {
- float diff = a->data->data[i] - b->data->data[i];
- // 计算 grad_a 的第 i 个元素的梯度
- grad_a->data[i] = g * diff / size;
- // 计算 grad_b 的第 i 个元素的梯度
- grad_b->data[i] = -g * diff / size;
- }
- return {grad_a, grad_b};
- }
- }; // class SquareLoss
-
- std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits); // Declaration only (defined elsewhere); SoftmaxLoss exponentiates the result to obtain probabilities - presumably a row-wise log-softmax, confirm against the definition.
-
- // Batch-averaged softmax cross-entropy between logits and label weights.
- //   logits: (batch_size x num_classes)
- //   labels: same layout; each entry weights the matching log-probability
- class SoftmaxLoss : public Loss {
- public:
-     // Records the operands and evaluates the loss immediately.
-     SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels) : Loss(logits, labels) {
-         this->data = this->forward();
-     }
-
-     // Returns a one-element tensor holding
-     //   -sum(labels * log_softmax(logits)) / batch_size.
-     //
-     // The computed value is now stored in the result; previously the result
-     // tensor was returned without the loss ever being written to it.
-     std::shared_ptr<tensor::Tensor> forward() override {
-         const auto log_probs = log_softmax(this->objects[0]->data);
-         const auto& labels = this->objects[1]->data;
-         const std::size_t batch_size = log_probs->shape[0];
-         const std::size_t num_classes = log_probs->shape[1];
-
-         float loss = 0.0f;
-         for (std::size_t i = 0; i < batch_size; ++i) {
-             for (std::size_t j = 0; j < num_classes; ++j) {
-                 // Flat row-major index of (sample i, class j).
-                 const std::size_t index = i * num_classes + j;
-                 loss += -labels->data[index] * log_probs->data[index];
-             }
-         }
-         // Average over the batch; an empty batch leaves the loss at 0.
-         if (batch_size != 0) {
-             loss /= batch_size;
-         }
-
-         std::vector<std::size_t> res_shape = {1};
-         auto res = std::make_shared<tensor::Tensor>(res_shape);
-         res->data[0] = loss;
-         return res;
-     }
-
-     // Returns {grad_logits, grad_labels}:
-     //   grad_logits = g * (softmax(logits) - labels) / batch_size,
-     //   where g is the scalar upstream gradient;
-     //   grad_labels is all zeros (no gradient is propagated to the labels).
-     std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
-         const auto log_probs = log_softmax(this->objects[0]->data);
-         const auto& labels = this->objects[1]->data;
-         const std::size_t batch_size = log_probs->shape[0];
-         const std::size_t num_classes = log_probs->shape[1];
-         auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
-         auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
-
-         const float g = gradient->data[0];
-         for (std::size_t i = 0; i < batch_size; ++i) {
-             for (std::size_t j = 0; j < num_classes; ++j) {
-                 const std::size_t index = i * num_classes + j;
-                 // exp(log_softmax) recovers the softmax probability.
-                 const float prob = std::exp(log_probs->data[index]);
-                 grad_logits->data[index] = g * (prob - labels->data[index]) / batch_size;
-             }
-         }
-
-         std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0);
-
-         return {grad_logits, grad_labels};
-     }
- }; // class SoftmaxLoss
-
- std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters); // Declaration only (defined elsewhere); presumably returns the gradient of `loss` for each entry of `parameters` - confirm against the definition.
-
- }
|