|
- #pragma once
- #include <vector>
- #include <memory>
- #include <unordered_set>
- #include <unordered_map>
- #include <algorithm>
- #include <pybind11/pybind11.h>
- #include <pybind11/numpy.h>
- #include <iostream>
- #include "../tensor/tensor.h"
- #include "../math/arith.h"
-
- namespace py = pybind11;
-
- namespace nn {
-
- class Node {
- public:
- std::shared_ptr<tensor::Tensor> data;
- std::vector<std::shared_ptr<Node>> objects;
- std::vector<std::shared_ptr<tensor::Tensor>> gradient;
- public:
- Node() {}
- virtual std::shared_ptr<tensor::Tensor> forward() = 0;
- virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
- std::vector<std::shared_ptr<Node>> get_parents() {
- return this->objects;
- }
- std::vector<float> get_data() {
- return this->data->data;
- }
- std::shared_ptr<tensor::Tensor> get_tensor() {
- return this->data;
- }
- // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
- // virtual void zero_grad() = 0;
- virtual ~Node() {}
- };
-
- class DataNode: public Node {
- public:
- DataNode() {}
- }; // class DataNode
-
- class Parameter: public DataNode {
- public:
- // Parameter(const std::vector<std::size_t>& shape) {
- // this->data = std::make_shared<tensor::Tensor>(shape, true);
- // }
- Parameter(py::array_t<float> array) {
- py::buffer_info info = array.request();
- float* dataPtr = static_cast<float*>(info.ptr);
- std::vector<std::size_t> shape = {};
- for (auto &it: info.shape) {
- shape.push_back(it);
- }
- auto tensor = std::make_shared<tensor::Tensor>(shape);
- std::vector<float> result(dataPtr, dataPtr + info.size);
- tensor->data = result;
- this->data = tensor;
- }
- std::shared_ptr<tensor::Tensor> forward() {
- return this->data;
- };
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
- return {gradient};
- };
- void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
- for (auto i = 0; i < this->data->size; i++) {
- this->data->data[i] -= lr * grad->data[i];
- }
- }
- }; // class Parameter
-
- class Constant: public DataNode {
- public:
- Constant(std::shared_ptr<tensor::Tensor> data) {
- this->data = data;
- }
- Constant(py::array_t<float> array) {
- this->data = tensor::pyarray_to_tensor(array);
- }
- std::shared_ptr<tensor::Tensor> forward() {
- return this->data;
- };
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
- return {gradient};
- };
- // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
- }; // class Constant
-
- class FunctionNode: public Node {
- public:
- FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
- this->objects.emplace_back(a);
- this->objects.emplace_back(b);
- }
- FunctionNode(std::shared_ptr<Node> a) {
- this->objects.emplace_back(a);
- }
-
- std::shared_ptr<tensor::Tensor> forward() override {
- return nullptr;
- }
- }; //class FunctionNode
-
- class Add: public FunctionNode {
- public:
- Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- auto a = this->objects[0];
- auto b = this->objects[1];
- auto outNode = std::make_shared<tensor::Tensor>(a->data->shape);
- for (auto i = 0; i < a->data->size; i++) {
- outNode->data[i] = a->data->data[i] + b->data->data[i];
- }
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- // assertion needed
- return {gradient, gradient};
- }
- };
-
- class AddBias: public FunctionNode {
- public:
- AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // features: a Node with shape (batch_size x num_features)
- // bias: a Node with shape (1 x num_features)
- auto features = this->objects[0];
- auto bias = this->objects[1];
- auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
- auto batch_size = features->data->shape[0];
- auto num_features = features->data->shape[1];
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_features; ++j) {
- // 计算索引:batch_size行,num_features列的二维张量
- size_t idx = i * num_features + j;
- // 每个样本的特征向量加上偏置向量
- outNode->data[idx] = features->data->data[idx] + bias->data->data[j];
- }
- }
- // for循环写加法总会写吧🤔
- // 补全这里的代码
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- // assertion needed
- auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
- // 从张量形状获取维度信息
- auto batch_size = gradient->shape[0];
- auto num_features = gradient->shape[1]; // 从shape中获取num_features
- // 补全这里的代码
-
- // 初始化偏置梯度为零
- for (size_t j = 0; j < num_features; ++j)
- {
- g_bias->data[j] = 0.0f;
- }
-
- // 计算偏置的梯度:对每个特征维度,将所有样本的梯度累加
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_features; ++j) {
- // 累加每个样本对该特征维度的梯度贡献
- g_bias->data[j] += gradient->data[i * num_features + j];
- }
- }
- return {gradient, g_bias};
- }
- std::vector<float> get_data() {
- return this->data->data;
- }
- }; // class AddBias
-
-
- class Linear: public FunctionNode {
- public:
- Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data=this->forward();
- // 这段代码就一行,参考下别的类是怎么写的呢?
- // 在这里补全
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // features: (batch_size x input_features)
- auto features = this->objects[0];
- // weights: (input_features x output_features)
- auto weights = this->objects[1];
- auto m = features->data->shape[0];
- auto k = features->data->shape[1];
- auto n = weights->data->shape[1];
- // std::cout << m << " " << n << " " << k << std::endl;
- // output: (batch_size x output_features)
- auto shape = {m, n};
- auto outNode = std::make_shared<tensor::Tensor>(shape);
- // 实际上你需要补全的是arith::mm函数,快去找找它在哪里
- // 其余部分不需要动
- arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
- return outNode;
- }
-
- // 辅助函数:矩阵转置
- template<typename T>
- std::vector<T> transpose(const std::vector<T>& mat, size_t rows, size_t cols) {
- std::vector<T> result(rows * cols);
- for (size_t i = 0; i < rows; ++i) {
- for (size_t j = 0; j < cols; ++j) {
- result[j * rows + i] = mat[i * cols + j];
- }
- }
- return result;
- }
-
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto features = this->objects[0];
- auto weights = this->objects[1];
- // gradient.shape[0] == features.shape[0]
- // gradient.shape[1] == weights.shape[1]
- auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
- auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
- auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
- auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
-
- // 计算输入特征的梯度
- // grad_features = gradient * weights^T
- auto weights_transposed = transpose(weights->data->data, weights->data->shape[0], weights->data->shape[1]);
- size_t m = gradient->shape[0];
- size_t k = weights->data->shape[1];
- size_t n = weights->data->shape[0];
- arith::mm(gradient->data, weights_transposed, grad_features->data, m, k, n);
-
- // 计算权重的梯度
- // grad_weights = features^T * gradient
- auto features_transposed = transpose(features->data->data, features->data->shape[0], features->data->shape[1]);
- m = features->data->shape[1];
- k = features->data->shape[0];
- n = gradient->shape[1];
- arith::mm(features_transposed, gradient->data, grad_weights->data, m, k, n);
-
- return {grad_features, grad_weights};
- }
- }; //class Linear
-
- class ReLU: public FunctionNode {
- public:
- ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
- // 补全这里
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // x: a Node with shape (batch_size x num_features)
- auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
- // 补全这里,调用arith::vector_scalar_max
- arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, 0.0f);
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
- // 补全这里,一个for循环
- for (size_t i = 0; i < grads->size; ++i) {
- if (this->objects[0]->data->data[i] > 0) {
- grads->data[i] = gradient->data[i];
- } else {
- grads->data[i] = 0;
- }
- }
- return {grads};
- }
- }; // class ReLU
-
- class Loss: public FunctionNode {
- public:
- bool used = false;
- public:
- Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
- };
-
- class SquareLoss: public Loss {
- public:
- SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
- // 补全这里的代码
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() {
- // a: a Node with shape (batch_size x dim)
- // b: a Node with shape (batch_size x dim)
- // 这个简单,就是要注意返回的res需要是一个tensor就行
- auto a = this->objects[0]->data;
- auto b = this->objects[1]->data;
- float sum_squared_diff = 0.0f;
- for (size_t i = 0; i < a->size; ++i) {
- float diff = a->data[i] - b->data[i];
- sum_squared_diff += diff * diff;
- }
- // 修改下面的代码
- float square_loss = sum_squared_diff / a->size;
- std::vector<size_t> res_shape = {1};
- auto res = std::make_shared<tensor::Tensor>(res_shape);
- res->data[0] = square_loss;
- return res;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- float g = gradient->data[0];
- auto a = this->objects[0];
- auto b = this->objects[1];
- auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
- auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
- // 补全下面的代码
- size_t n = a->data->size;
- for (size_t i = 0; i < n; ++i) {
- float diff = a->data->data[i] - b->data->data[i];
- grad_a->data[i] = g * (2.0f / n) * diff;
- grad_b->data[i] = -g * (2.0f / n) * diff;
- }
- return {grad_a, grad_b};
- }
- }; // class SquareLoss
-
- std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);
-
- class SoftmaxLoss: public Loss {
- public:
- SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
- this->data = this->forward();
- }
-
- std::shared_ptr<tensor::Tensor> forward() {
- // 我们已经帮你写好log_softmax
- auto log_probs = log_softmax(this->objects[0]->data);
- // 补全下面的代码,计算softmax loss
- auto labels = this->objects[1]->data;
- // 样本数量
- auto batch_size = log_probs->shape[0];
- // 类别数量
- auto num_classes = log_probs->shape[1];
- // 初始化损失值
- float loss = 0.0f;
-
- // 计算 softmax 损失
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_classes; ++j) {
- // 计算索引
- size_t idx = i * num_classes + j;
- // 累加损失
- loss += labels->data[idx] * log_probs->data[idx];
- }
- }
- // 求平均损失
- loss = -loss / batch_size;
-
- std::vector<size_t> res_shape = {1};
- auto res = std::make_shared<tensor::Tensor>(res_shape);
- res->data[0] = loss;
- return res;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto log_probs = log_softmax(this->objects[0]->data);
- std::vector<float> probs(log_probs->data.size());
- for (size_t i = 0; i < log_probs->data.size(); ++i) {
- probs[i] = std::exp(log_probs->data[i]);
- }
- auto labels = this->objects[1]->data;
- auto batch_size = log_probs->shape[0];
- auto num_classes = log_probs->shape[1];
- auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
- auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
- // 补全下面的代码
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_classes; ++j) {
- size_t idx = i * num_classes + j;
- // 根据公式计算梯度
- grad_logits->data[idx] = (probs[idx] - labels->data[idx]) / batch_size;
- }
- }
-
- std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0.0f);
- return {grad_logits, grad_labels};
- }
- }; // class SoftmaxLoss
-
- std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
-
- }
|