#pragma once #include #include #include #include #include #include #include #include #include "../tensor/tensor.h" #include "../math/arith.h" namespace py = pybind11; namespace nn { class Node { public: std::shared_ptr data; std::vector> objects; std::vector> gradient; public: Node() {} virtual std::shared_ptr forward() = 0; virtual std::vector> backward(std::shared_ptr gradient) = 0; std::vector> get_parents() { return this->objects; } std::vector get_data() { return this->data->data; } std::shared_ptr get_tensor() { return this->data; } // virtual void update(std::shared_ptr grad, float lr) = 0; // virtual void zero_grad() = 0; virtual ~Node() {} }; class DataNode: public Node { public: DataNode() {} }; // class DataNode class Parameter: public DataNode { public: // Parameter(const std::vector& shape) { // this->data = std::make_shared(shape, true); // } Parameter(py::array_t array) { py::buffer_info info = array.request(); float* dataPtr = static_cast(info.ptr); std::vector shape = {}; for (auto &it: info.shape) { shape.push_back(it); } auto tensor = std::make_shared(shape); std::vector result(dataPtr, dataPtr + info.size); tensor->data = result; this->data = tensor; } std::shared_ptr forward() { return this->data; }; std::vector> backward(std::shared_ptr gradient) { return {gradient}; }; void update(std::shared_ptr grad, double lr) { for (auto i = 0; i < this->data->size; i++) { this->data->data[i] -= lr * grad->data[i]; } } }; // class Parameter class Constant: public DataNode { public: Constant(std::shared_ptr data) { this->data = data; } Constant(py::array_t array) { this->data = tensor::pyarray_to_tensor(array); } std::shared_ptr forward() { return this->data; }; std::vector> backward(std::shared_ptr gradient) { return {gradient}; }; // void update(std::shared_ptr grad, float lr) {} }; // class Constant class FunctionNode: public Node { public: FunctionNode(std::shared_ptr a, std::shared_ptr b) { this->objects.emplace_back(a); this->objects.emplace_back(b); } FunctionNode(std::shared_ptr a) { this->objects.emplace_back(a); } std::shared_ptr forward() override { return nullptr; } }; //class FunctionNode class Add: public FunctionNode { public: Add(std::shared_ptr a, std::shared_ptr b) : FunctionNode(a, b) { this->data = this->forward(); } std::shared_ptr forward() override { auto a = this->objects[0]; auto b = this->objects[1]; auto outNode = std::make_shared(a->data->shape); for (auto i = 0; i < a->data->size; i++) { outNode->data[i] = a->data->data[i] + b->data->data[i]; } return outNode; } std::vector> backward(std::shared_ptr gradient) override { // assertion needed return {gradient, gradient}; } }; class AddBias: public FunctionNode { public: AddBias(std::shared_ptr a, std::shared_ptr b) : FunctionNode(a, b) { this->data = this->forward(); } std::shared_ptr forward() override { // features: a Node with shape (batch_size x num_features) // bias: a Node with shape (1 x num_features) auto features = this->objects[0]; auto bias = this->objects[1]; auto outNode = std::make_shared(features->data->shape); auto batch_size = features->data->shape[0]; auto num_features = features->data->shape[1]; for (size_t i = 0; i < batch_size; ++i) { for (size_t j = 0; j < num_features; ++j) { // 计算索引：batch_size行，num_features列的二维张量 size_t idx = i * num_features + j; // 每个样本的特征向量加上偏置向量 outNode->data[idx] = features->data->data[idx] + bias->data->data[j]; } } // for循环写加法总会写吧🤔 // 补全这里的代码 return outNode; } std::vector> backward(std::shared_ptr gradient) override { // assertion needed auto g_bias = std::make_shared(this->objects[1]->data->shape); // 从张量形状获取维度信息 auto batch_size = gradient->shape[0]; auto num_features = gradient->shape[1]; // 从shape中获取num_features // 补全这里的代码 auto batch_size = gradient->shape[0]; auto num_features = gradient->shape[1]; // 初始化偏置梯度为零 for (size_t j = 0; j < num_features; ++j) { g_bias->data[j] = 0.0f; } // 计算偏置的梯度：对每个特征维度，将所有样本的梯度累加 for (size_t i = 0; i < batch_size; ++i) { for (size_t j = 0; j < num_features; ++j) { // 累加每个样本对该特征维度的梯度贡献 g_bias->data[j] += gradient->data[i * num_features + j]; } } return {gradient, g_bias}; } std::vector get_data() { return this->data->data; } }; // class AddBias class Linear: public FunctionNode { public: Linear(std::shared_ptr a, std::shared_ptr b) : FunctionNode(a, b) { this->data=this->forward(); // 这段代码就一行，参考下别的类是怎么写的呢？ // 在这里补全 } std::shared_ptr forward() override { // features: (batch_size x input_features) auto features = this->objects[0]; // weights: (input_features x output_features) auto weights = this->objects[1]; auto m = features->data->shape[0]; auto k = features->data->shape[1]; auto n = weights->data->shape[1]; // std::cout << m << " " << n << " " << k << std::endl; // output: (batch_size x output_features) auto shape = {m, n}; auto outNode = std::make_shared(shape); // 实际上你需要补全的是arith::mm函数，快去找找它在哪里 // 其余部分不需要动 arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n); return outNode; } std::vector> backward(std::shared_ptr gradient) override { auto features = this->objects[0]; auto weights = this->objects[1]; // gradient.shape[0] == features.shape[0] // gradient.shape[1] == weights.shape[1] auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]}; auto grad_features = std::make_shared(grad_features_shape); auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]}; auto grad_weights = std::make_shared(grad_weights_shape); // 这里要调用两次arith:mm，是分别把哪两个矩阵相乘呢？ return {grad_features, grad_weights}; } }; //class Linear class ReLU: public FunctionNode { public: ReLU(std::shared_ptr a) : FunctionNode(a) { // 补全这里 } std::shared_ptr forward() override { // x: a Node with shape (batch_size x num_features) auto outNode = std::make_shared(this->objects[0]->data->shape); // 补全这里，调用arith::vector_scalar_max return outNode; } std::vector> backward(std::shared_ptr gradient) override { auto grads = std::make_shared(this->objects[0]->data->shape); // 补全这里，一个for循环 return {grads}; } }; // class ReLU class Loss: public FunctionNode { public: bool used = false; public: Loss(std::shared_ptr a, std::shared_ptr b) : FunctionNode(a, b) {} }; class SquareLoss: public Loss { public: SquareLoss(std::shared_ptr a, std::shared_ptr b): Loss(a, b) { // 补全这里的代码 } std::shared_ptr forward() { // a: a Node with shape (batch_size x dim) // b: a Node with shape (batch_size x dim) // 这个简单，就是要注意返回的res需要是一个tensor就行 // 修改下面的代码 std::vector res_shape = {1}; auto res = std::make_shared(res_shape); return res; } std::vector> backward(std::shared_ptr gradient) override { float g = gradient->data[0]; auto a = this->objects[0]; auto b = this->objects[1]; auto grad_a = std::make_shared(a->data->shape); auto grad_b = std::make_shared(b->data->shape); // 补全下面的代码 return {grad_a, grad_b}; } }; // class SquareLoss std::shared_ptr log_softmax(std::shared_ptr logits); class SoftmaxLoss: public Loss { public: SoftmaxLoss(std::shared_ptr logits, std::shared_ptr labels): Loss(logits, labels) { this->data = this->forward(); } std::shared_ptr forward() { // 我们已经帮你写好log_softmax auto log_probs = log_softmax(this->objects[0]->data); // 补全下面的代码，计算softmax loss std::vector res_shape = {1}; auto res = std::make_shared(res_shape); return res; } std::vector> backward(std::shared_ptr gradient) override { auto log_probs = log_softmax(this->objects[0]->data); auto labels = this->objects[1]->data; auto batch_size = log_probs->shape[0]; auto num_classes = log_probs->shape[1]; auto grad_logits = std::make_shared(log_probs->shape); auto grad_labels = std::make_shared(labels->shape); // 补全下面的代码 return {grad_logits, grad_labels}; } }; // class SoftmaxLoss std::vector> gradients(std::shared_ptr loss, std::vector> parameters); }