// PREScode/uc-modern-cpp-student

#pragma once
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

#include "../tensor/tensor.h"
#include "../math/arith.h"

namespace py = pybind11;

namespace nn {

/// Abstract base class for every vertex of the computation graph.
///
/// A node stores the tensor it holds, shared handles to the nodes it was
/// built from, and a list of gradient tensors. Concrete nodes implement
/// forward() and backward().
class Node {
public:
    /// Tensor held by this node; may be null until a subclass assigns it.
    std::shared_ptr<tensor::Tensor> data;
    /// Input (parent) nodes, in the order the subclass registered them.
    std::vector<std::shared_ptr<Node>> objects;
    /// Gradient tensors; this class never reads or writes the list itself.
    std::vector<std::shared_ptr<tensor::Tensor>> gradient;

    Node() = default;

    /// Computes this node's output tensor.
    virtual std::shared_ptr<tensor::Tensor> forward() = 0;

    /// Maps the gradient arriving at this node to a list of gradient tensors.
    virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;

    /// Returns a copy of the list of input nodes.
    std::vector<std::shared_ptr<Node>> get_parents() {
        return objects;
    }

    /// Returns a copy of the flat element buffer of the held tensor.
    /// Dereferences `data`, so the node must already hold a tensor.
    std::vector<float> get_data() {
        return data->data;
    }

    /// Returns the shared handle to the held tensor.
    std::shared_ptr<tensor::Tensor> get_tensor() {
        return data;
    }

    // Not part of the interface (kept for reference):
    // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
    // virtual void zero_grad() = 0;

    virtual ~Node() = default;
};

/// Intermediate base class for nodes that hold a tensor directly.
/// Adds nothing to Node; forward() and backward() remain pure virtual here.
class DataNode: public Node {
public:
    DataNode() = default;
}; // class DataNode

class Parameter: public DataNode {
public:
    // Parameter(const std::vector<std::size_t>& shape) {
    //     this->data = std::make_shared<tensor::Tensor>(shape, true);
    // }
    Parameter(py::array_t<float> array) {
        py::buffer_info info = array.request();
        float* dataPtr = static_cast<float*>(info.ptr);
        std::vector<std::size_t> shape = {};
        for (auto &it: info.shape) {
            shape.push_back(it);
        }
        auto tensor = std::make_shared<tensor::Tensor>(shape);
        std::vector<float> result(dataPtr, dataPtr + info.size);
        tensor->data = result;
        this->data = tensor;
    }
    std::shared_ptr<tensor::Tensor> forward() {
        return this->data;
    };
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
        return {gradient};
    };
    void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
        for (auto i = 0; i < this->data->size; i++) {
            this->data->data[i] -= lr * grad->data[i];
        }
    }
}; // class Parameter

class Constant: public DataNode {
public:
    Constant(std::shared_ptr<tensor::Tensor> data) {
        this->data = data;
    }
    Constant(py::array_t<float> array) {
        this->data = tensor::pyarray_to_tensor(array);
    }
    std::shared_ptr<tensor::Tensor> forward() {
        return this->data;
    };
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
        return {gradient};
    };
    // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
}; // class Constant

class FunctionNode: public Node {
public:
    FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
        this->objects.emplace_back(a);
        this->objects.emplace_back(b);
    }
    FunctionNode(std::shared_ptr<Node> a) {
        this->objects.emplace_back(a);
    }

    std::shared_ptr<tensor::Tensor> forward() override {
        return nullptr;
    }
}; //class FunctionNode

/// Element-wise sum of two input nodes. The result takes the shape of the
/// first input and is computed once, at construction.
class Add: public FunctionNode {
public:
    /// Registers both inputs and stores forward() in `data`.
    /// @throws std::invalid_argument see forward().
    Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        this->data = this->forward();
    }

    /// Computes out[i] = a[i] + b[i] over every element of the first input.
    ///
    /// @return A new tensor shaped like the first input.
    /// @throws std::invalid_argument if an input holds no tensor, or if either
    ///         buffer has fewer elements than the first input's element count
    ///         (previously an out-of-bounds read).
    std::shared_ptr<tensor::Tensor> forward() override {
        const auto& lhs = this->objects[0]->data;
        const auto& rhs = this->objects[1]->data;
        if (!lhs || !rhs) {
            throw std::invalid_argument("Add: both inputs must hold a tensor");
        }
        const std::size_t count = lhs->size;
        if (lhs->data.size() < count || rhs->data.size() < count) {
            throw std::invalid_argument("Add: second input has fewer elements than the first");
        }

        auto sum = std::make_shared<tensor::Tensor>(lhs->shape);
        // std::size_t index: the original `auto i = 0` deduced int, which
        // mixes signedness with the element count.
        for (std::size_t i = 0; i < count; ++i) {
            sum->data[i] = lhs->data[i] + rhs->data[i];
        }
        return sum;
    }

    /// The incoming gradient is handed unchanged to both inputs.
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        return {gradient, gradient};
    }
};

/// Adds a bias vector to every row of a 2-D feature tensor.
///
/// Input 0 is read as (batch_size x num_features) in row-major order; input 1
/// supplies one bias value per feature column.
class AddBias: public FunctionNode {
public:
    /// Registers both inputs and stores forward() in `data`.
    /// @throws std::invalid_argument see forward().
    AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        this->data = this->forward();
    }

    /// Computes out[i][j] = features[i][j] + bias[j].
    ///
    /// @return A new tensor shaped like the feature tensor.
    /// @throws std::invalid_argument if an input holds no tensor, the feature
    ///         tensor has fewer than two dimensions, or a buffer is too small
    ///         for the shape (each was previously an out-of-bounds access).
    std::shared_ptr<tensor::Tensor> forward() override {
        const auto& features = this->objects[0]->data;
        const auto& bias = this->objects[1]->data;
        if (!features || !bias) {
            throw std::invalid_argument("AddBias: both inputs must hold a tensor");
        }
        if (features->shape.size() < 2) {
            throw std::invalid_argument("AddBias: features must be at least 2-dimensional");
        }
        const std::size_t batch_size = features->shape[0];
        const std::size_t num_features = features->shape[1];
        if (features->data.size() < batch_size * num_features || bias->data.size() < num_features) {
            throw std::invalid_argument("AddBias: bias or feature buffer is smaller than the feature shape requires");
        }

        auto out = std::make_shared<tensor::Tensor>(features->shape);
        for (std::size_t i = 0; i < batch_size; ++i) {
            for (std::size_t j = 0; j < num_features; ++j) {
                // Row-major flat index of element (i, j).
                const std::size_t idx = i * num_features + j;
                out->data[idx] = features->data[idx] + bias->data[j];
            }
        }
        return out;
    }

    /// Returns {gradient for features, gradient for bias}.
    ///
    /// The feature gradient is the incoming gradient itself. The bias gradient
    /// has the bias tensor's shape and holds, per feature column, the sum of
    /// the incoming gradient over all rows.
    ///
    /// @throws std::invalid_argument if `gradient` is null, has fewer than two
    ///         dimensions, or a buffer is too small for its shape.
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        if (!gradient || gradient->shape.size() < 2) {
            throw std::invalid_argument("AddBias::backward: gradient must be a tensor with at least 2 dimensions");
        }
        auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
        const std::size_t batch_size = gradient->shape[0];
        const std::size_t num_features = gradient->shape[1];
        if (g_bias->data.size() < num_features || gradient->data.size() < batch_size * num_features) {
            throw std::invalid_argument("AddBias::backward: gradient shape does not fit the bias or gradient buffer");
        }

        // Start the accumulators from zero.
        std::fill(g_bias->data.begin(), g_bias->data.end(), 0.0f);

        // Sum each feature column of the incoming gradient over the batch.
        for (std::size_t i = 0; i < batch_size; ++i) {
            for (std::size_t j = 0; j < num_features; ++j) {
                g_bias->data[j] += gradient->data[i * num_features + j];
            }
        }
        return {gradient, g_bias};
    }

    /// Returns a copy of the flat element buffer of the output tensor.
    std::vector<float> get_data() {
        return this->data->data;
    }
}; // class AddBias


class Linear: public FunctionNode {
public:
    Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        this->data=this->forward(); 
        // 这段代码就一行，参考下别的类是怎么写的呢？
        // 在这里补全
    }
    std::shared_ptr<tensor::Tensor> forward() override {
        // features: (batch_size x input_features)
        auto features = this->objects[0];
        // weights: (input_features x output_features)
        auto weights = this->objects[1];
        auto m = features->data->shape[0];
        auto k = features->data->shape[1];
        auto n = weights->data->shape[1];
        // std::cout << m << " " << n << " " << k << std::endl;
        // output: (batch_size x output_features)
        auto shape = {m, n};
        auto outNode = std::make_shared<tensor::Tensor>(shape);
        // 实际上你需要补全的是arith::mm函数，快去找找它在哪里
        // 其余部分不需要动
        arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
        return outNode;
    }

    // 辅助函数：矩阵转置
    template<typename T>
    std::vector<T> transpose(const std::vector<T>& mat, size_t rows, size_t cols) {
        std::vector<T> result(rows * cols);
        for (size_t i = 0; i < rows; ++i) {
            for (size_t j = 0; j < cols; ++j) {
                result[j * rows + i] = mat[i * cols + j];
            }
        }
        return result;
    }

    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto features = this->objects[0];
        auto weights = this->objects[1];
        // gradient.shape[0] == features.shape[0]
        // gradient.shape[1] == weights.shape[1]
        auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
        auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
        auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
        auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);

        // 计算输入特征的梯度
        // grad_features = gradient * weights^T
        auto weights_transposed = transpose(weights->data->data, weights->data->shape[0], weights->data->shape[1]);
        size_t m = gradient->shape[0];
        size_t k = weights->data->shape[1];
        size_t n = weights->data->shape[0];
        arith::mm(gradient->data, weights_transposed, grad_features->data, m, k, n);

        // 计算权重的梯度
        // grad_weights = features^T * gradient
        auto features_transposed = transpose(features->data->data, features->data->shape[0], features->data->shape[1]);
        m = features->data->shape[1];
        k = features->data->shape[0];
        n = gradient->shape[1];
        arith::mm(features_transposed, gradient->data, grad_weights->data, m, k, n);

        return {grad_features, grad_weights};
    }
}; //class Linear

/// Rectifier node built from one input; the output has the input's shape.
class ReLU: public FunctionNode {
public:
    /// Registers the input and stores forward() in `data`.
    ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
        this->data = this->forward();
    }

    /// Fills a new tensor via arith::vector_scalar_max(input, output, 0.0f).
    std::shared_ptr<tensor::Tensor> forward() override {
        const auto& input = this->objects[0]->data;
        auto activated = std::make_shared<tensor::Tensor>(input->shape);
        arith::vector_scalar_max(input->data, activated->data, 0.0f);
        return activated;
    }

    /// Lets the incoming gradient through where the input is strictly
    /// positive and writes zero everywhere else.
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        const auto& input = this->objects[0]->data;
        auto masked = std::make_shared<tensor::Tensor>(input->shape);
        for (size_t i = 0; i < masked->size; ++i) {
            masked->data[i] = (input->data[i] > 0) ? gradient->data[i] : 0.0f;
        }
        return {masked};
    }
}; // class ReLU

/// Common base class for two-input loss nodes. It only records the inputs;
/// backward() is still pure virtual, so concrete losses derive from it.
class Loss: public FunctionNode {
public:
    /// Starts out false; this class never reads or changes it.
    bool used{false};

    /// Registers `a` as input 0 and `b` as input 1.
    Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
};

class SquareLoss: public Loss {
public:
    SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
        // 补全这里的代码
        this->data = this->forward();
    }
    std::shared_ptr<tensor::Tensor> forward() {
        // a: a Node with shape (batch_size x dim)
        // b: a Node with shape (batch_size x dim)
        // 这个简单，就是要注意返回的res需要是一个tensor就行
        auto a = this->objects[0]->data;
        auto b = this->objects[1]->data;
        float sum_squared_diff = 0.0f;
        for (size_t i = 0; i < a->size; ++i) {
            float diff = a->data[i] - b->data[i];
            sum_squared_diff += diff * diff;
        }
        // 修改下面的代码
        float square_loss = sum_squared_diff / a->size;
        std::vector<size_t> res_shape = {1};
        auto res = std::make_shared<tensor::Tensor>(res_shape);
        res->data[0] = square_loss;
        return res;
    }
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        float g = gradient->data[0];
        auto a = this->objects[0];
        auto b = this->objects[1];
        auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
        auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
        // 补全下面的代码
        size_t n = a->data->size;
        for (size_t i = 0; i < n; ++i) {
            float diff = a->data->data[i] - b->data->data[i];
            grad_a->data[i] = g * (2.0f / n) * diff;
            grad_b->data[i] = -g * (2.0f / n) * diff;
        }
        return {grad_a, grad_b};
    }
}; // class SquareLoss

// Declaration only; no definition appears in this part of the file.
// SoftmaxLoss passes its logits tensor here, indexes the result as
// (batch_size x num_classes) and exponentiates its entries to obtain
// probabilities -- presumably a row-wise log-softmax; confirm against the
// definition.
std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);

class SoftmaxLoss: public Loss {
public:
    SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
        this->data = this->forward();
    }

    std::shared_ptr<tensor::Tensor> forward() {
        // 我们已经帮你写好log_softmax
        auto log_probs = log_softmax(this->objects[0]->data);
        // 补全下面的代码，计算softmax loss
        auto labels = this->objects[1]->data;
        // 样本数量
        auto batch_size = log_probs->shape[0];
        // 类别数量
        auto num_classes = log_probs->shape[1];
        // 初始化损失值
        float loss = 0.0f;

        // 计算 softmax 损失
        for (size_t i = 0; i < batch_size; ++i) {
            for (size_t j = 0; j < num_classes; ++j) {
                // 计算索引
                size_t idx = i * num_classes + j;
                // 累加损失
                loss += labels->data[idx] * log_probs->data[idx];
            }
        }
        // 求平均损失
        loss = -loss / batch_size;

        std::vector<size_t> res_shape = {1};
        auto res = std::make_shared<tensor::Tensor>(res_shape);
        res->data[0] = loss;
        return res;
    }
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto log_probs = log_softmax(this->objects[0]->data);
        std::vector<float> probs(log_probs->data.size());
        for (size_t i = 0; i < log_probs->data.size(); ++i) {
            probs[i] = std::exp(log_probs->data[i]);
        }
        auto labels = this->objects[1]->data;
        auto batch_size = log_probs->shape[0];
        auto num_classes = log_probs->shape[1];
        auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
        auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
        // 补全下面的代码
        for (size_t i = 0; i < batch_size; ++i) {
            for (size_t j = 0; j < num_classes; ++j) {
                size_t idx = i * num_classes + j;
                // 根据公式计算梯度
                grad_logits->data[idx] = (probs[idx] - labels->data[idx]) / batch_size;
            }
        }

        std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0.0f);
        return {grad_logits, grad_labels};
    }
}; // class SoftmaxLoss

// Declaration only; no definition appears in this part of the file.
// Takes a loss node and a list of nodes and returns a list of tensors --
// presumably one gradient of `loss` per entry of `parameters`, in the same
// order; TODO confirm against the definition.
std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);

}