|
#pragma once
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

#include "../tensor/tensor.h"
#include "../math/arith.h"
-
- namespace py = pybind11;
-
- namespace nn {
-
- class Node {
- public:
- std::shared_ptr<tensor::Tensor> data;
- std::vector<std::shared_ptr<Node>> objects;
- std::vector<std::shared_ptr<tensor::Tensor>> gradient;
- public:
- Node() {}
- virtual std::shared_ptr<tensor::Tensor> forward() = 0;
- virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
- std::vector<std::shared_ptr<Node>> get_parents() {
- return this->objects;
- }
- std::vector<float> get_data() {
- return this->data->data;
- }
- std::shared_ptr<tensor::Tensor> get_tensor() {
- return this->data;
- }
- // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
- // virtual void zero_grad() = 0;
- virtual ~Node() {}
- };
-
- class DataNode: public Node {
- public:
- DataNode() {}
- }; // class DataNode
-
- class Parameter: public DataNode {
- public:
- // Parameter(const std::vector<std::size_t>& shape) {
- // this->data = std::make_shared<tensor::Tensor>(shape, true);
- // }
- Parameter(py::array_t<float> array) {
- py::buffer_info info = array.request();
- float* dataPtr = static_cast<float*>(info.ptr);
- std::vector<std::size_t> shape = {};
- for (auto &it: info.shape) {
- shape.push_back(it);
- }
- auto tensor = std::make_shared<tensor::Tensor>(shape);
- std::vector<float> result(dataPtr, dataPtr + info.size);
- tensor->data = result;
- this->data = tensor;
- }
- std::shared_ptr<tensor::Tensor> forward() {
- return this->data;
- };
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
- return {gradient};
- };
- void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
- for (auto i = 0; i < this->data->size; i++) {
- this->data->data[i] -= lr * grad->data[i];
- }
- }
- }; // class Parameter
-
- class Constant: public DataNode {
- public:
- Constant(std::shared_ptr<tensor::Tensor> data) {
- this->data = data;
- }
- Constant(py::array_t<float> array) {
- this->data = tensor::pyarray_to_tensor(array);
- }
- std::shared_ptr<tensor::Tensor> forward() {
- return this->data;
- };
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
- return {gradient};
- };
- // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
- }; // class Constant
-
- class FunctionNode: public Node {
- public:
- FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
- this->objects.emplace_back(a);
- this->objects.emplace_back(b);
- }
- FunctionNode(std::shared_ptr<Node> a) {
- this->objects.emplace_back(a);
- }
-
- std::shared_ptr<tensor::Tensor> forward() override {
- return nullptr;
- }
- }; //class FunctionNode
-
- class Add: public FunctionNode {
- public:
- Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- auto a = this->objects[0];
- auto b = this->objects[1];
- auto outNode = std::make_shared<tensor::Tensor>(a->data->shape);
- for (auto i = 0; i < a->data->size; i++) {
- outNode->data[i] = a->data->data[i] + b->data->data[i];
- }
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- // assertion needed
- return {gradient, gradient};
- }
- };
-
- class AddBias: public FunctionNode {
- public:
- AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data = this->forward();
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // features: a Node with shape (batch_size x num_features)
- // bias: a Node with shape (1 x num_features)
- auto features = this->objects[0];
- auto bias = this->objects[1];
- auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
- auto batch_size = features->data->shape[0];
- auto num_features = features->data->shape[1];
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_features; ++j) {
- // 计算索引:batch_size行,num_features列的二维张量
- size_t idx = i * num_features + j;
- // 每个样本的特征向量加上偏置向量
- outNode->data[idx] = features->data->data[idx] + bias->data->data[j];
- }
- }
- // for循环写加法总会写吧🤔
- // 补全这里的代码
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- // assertion needed
- auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
- // 从张量形状获取维度信息
- auto batch_size = gradient->shape[0];
- auto num_features = gradient->shape[1]; // 从shape中获取num_features
- // 补全这里的代码
- auto batch_size = gradient->shape[0];
- auto num_features = gradient->shape[1];
-
- // 初始化偏置梯度为零
- for (size_t j = 0; j < num_features; ++j)
- {
- g_bias->data[j] = 0.0f;
- }
-
- // 计算偏置的梯度:对每个特征维度,将所有样本的梯度累加
- for (size_t i = 0; i < batch_size; ++i) {
- for (size_t j = 0; j < num_features; ++j) {
- // 累加每个样本对该特征维度的梯度贡献
- g_bias->data[j] += gradient->data[i * num_features + j];
- }
- }
- return {gradient, g_bias};
- }
- std::vector<float> get_data() {
- return this->data->data;
- }
- }; // class AddBias
-
-
- class Linear: public FunctionNode {
- public:
- Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
- this->data=this->forward();
- // 这段代码就一行,参考下别的类是怎么写的呢?
- // 在这里补全
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // features: (batch_size x input_features)
- auto features = this->objects[0];
- // weights: (input_features x output_features)
- auto weights = this->objects[1];
- auto m = features->data->shape[0];
- auto k = features->data->shape[1];
- auto n = weights->data->shape[1];
- // std::cout << m << " " << n << " " << k << std::endl;
- // output: (batch_size x output_features)
- auto shape = {m, n};
- auto outNode = std::make_shared<tensor::Tensor>(shape);
- // 实际上你需要补全的是arith::mm函数,快去找找它在哪里
- // 其余部分不需要动
- arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
- return outNode;
- }
-
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto features = this->objects[0];
- auto weights = this->objects[1];
- // gradient.shape[0] == features.shape[0]
- // gradient.shape[1] == weights.shape[1]
- auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
- auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
- auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
- auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
- // 这里要调用两次arith:mm,是分别把哪两个矩阵相乘呢?
- return {grad_features, grad_weights};
- }
- }; //class Linear
-
- class ReLU: public FunctionNode {
- public:
- ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
- // 补全这里
- }
- std::shared_ptr<tensor::Tensor> forward() override {
- // x: a Node with shape (batch_size x num_features)
- auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
- // 补全这里,调用arith::vector_scalar_max
- return outNode;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
- // 补全这里,一个for循环
-
- return {grads};
- }
- }; // class ReLU
-
- class Loss: public FunctionNode {
- public:
- bool used = false;
- public:
- Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
- };
-
- class SquareLoss: public Loss {
- public:
- SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
- // 补全这里的代码
- }
- std::shared_ptr<tensor::Tensor> forward() {
- // a: a Node with shape (batch_size x dim)
- // b: a Node with shape (batch_size x dim)
- // 这个简单,就是要注意返回的res需要是一个tensor就行
- // 修改下面的代码
- std::vector<size_t> res_shape = {1};
- auto res = std::make_shared<tensor::Tensor>(res_shape);
- return res;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- float g = gradient->data[0];
- auto a = this->objects[0];
- auto b = this->objects[1];
- auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
- auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
- // 补全下面的代码
- return {grad_a, grad_b};
- }
- }; // class SquareLoss
-
- std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);
-
- class SoftmaxLoss: public Loss {
- public:
- SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
- this->data = this->forward();
- }
-
- std::shared_ptr<tensor::Tensor> forward() {
- // 我们已经帮你写好log_softmax
- auto log_probs = log_softmax(this->objects[0]->data);
- // 补全下面的代码,计算softmax loss
- std::vector<size_t> res_shape = {1};
- auto res = std::make_shared<tensor::Tensor>(res_shape);
- return res;
- }
- std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
- auto log_probs = log_softmax(this->objects[0]->data);
- auto labels = this->objects[1]->data;
- auto batch_size = log_probs->shape[0];
- auto num_classes = log_probs->shape[1];
- auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
- auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
- // 补全下面的代码
- return {grad_logits, grad_labels};
- }
- }; // class SoftmaxLoss
-
- std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
-
- }
|