You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nn.h 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. #pragma once
  #include <algorithm>
  #include <cmath>
  #include <iostream>
  #include <memory>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>

  #include <pybind11/numpy.h>
  #include <pybind11/pybind11.h>

  #include "../math/arith.h"
  #include "../tensor/tensor.h"
  12. namespace py = pybind11;
  13. namespace nn {
  14. class Node {
  15. public:
  16. std::shared_ptr<tensor::Tensor> data;
  17. std::vector<std::shared_ptr<Node>> objects;
  18. std::vector<std::shared_ptr<tensor::Tensor>> gradient;
  19. public:
  20. Node() {}
  21. virtual std::shared_ptr<tensor::Tensor> forward() = 0;
  22. virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
  23. std::vector<std::shared_ptr<Node>> get_parents() {
  24. return this->objects;
  25. }
  26. std::vector<float> get_data() {
  27. return this->data->data;
  28. }
  29. std::shared_ptr<tensor::Tensor> get_tensor() {
  30. return this->data;
  31. }
  32. // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
  33. // virtual void zero_grad() = 0;
  34. virtual ~Node() {}
  35. };
  36. class DataNode: public Node {
  37. public:
  38. DataNode() {}
  39. }; // class DataNode
  40. class Parameter: public DataNode {
  41. public:
  42. // Parameter(const std::vector<std::size_t>& shape) {
  43. // this->data = std::make_shared<tensor::Tensor>(shape, true);
  44. // }
  45. Parameter(py::array_t<float> array) {
  46. py::buffer_info info = array.request();
  47. float* dataPtr = static_cast<float*>(info.ptr);
  48. std::vector<std::size_t> shape = {};
  49. for (auto &it: info.shape) {
  50. shape.push_back(it);
  51. }
  52. auto tensor = std::make_shared<tensor::Tensor>(shape);
  53. std::vector<float> result(dataPtr, dataPtr + info.size);
  54. tensor->data = result;
  55. this->data = tensor;
  56. }
  57. std::shared_ptr<tensor::Tensor> forward() {
  58. return this->data;
  59. };
  60. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
  61. return {gradient};
  62. };
  63. void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
  64. for (auto i = 0; i < this->data->size; i++) {
  65. this->data->data[i] -= lr * grad->data[i];
  66. }
  67. }
  68. }; // class Parameter
  69. class Constant: public DataNode {
  70. public:
  71. Constant(std::shared_ptr<tensor::Tensor> data) {
  72. this->data = data;
  73. }
  74. Constant(py::array_t<float> array) {
  75. this->data = tensor::pyarray_to_tensor(array);
  76. }
  77. std::shared_ptr<tensor::Tensor> forward() {
  78. return this->data;
  79. };
  80. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
  81. return {gradient};
  82. };
  83. // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
  84. }; // class Constant
  85. class FunctionNode: public Node {
  86. public:
  87. FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
  88. this->objects.emplace_back(a);
  89. this->objects.emplace_back(b);
  90. }
  91. FunctionNode(std::shared_ptr<Node> a) {
  92. this->objects.emplace_back(a);
  93. }
  94. std::shared_ptr<tensor::Tensor> forward() override {
  95. return nullptr;
  96. }
  97. }; //class FunctionNode
  98. class Add: public FunctionNode {
  99. public:
  100. Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  101. this->data = this->forward();
  102. }
  103. std::shared_ptr<tensor::Tensor> forward() override {
  104. auto a = this->objects[0];
  105. auto b = this->objects[1];
  106. auto outNode = std::make_shared<tensor::Tensor>(a->data->shape);
  107. for (auto i = 0; i < a->data->size; i++) {
  108. outNode->data[i] = a->data->data[i] + b->data->data[i];
  109. }
  110. return outNode;
  111. }
  112. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  113. // assertion needed
  114. return {gradient, gradient};
  115. }
  116. };
  117. class AddBias: public FunctionNode {
  118. public:
  119. AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  120. this->data = this->forward();
  121. }
  122. std::shared_ptr<tensor::Tensor> forward() override {
  123. // features: a Node with shape (batch_size x num_features)
  124. // bias: a Node with shape (1 x num_features)
  125. auto features = this->objects[0];
  126. auto bias = this->objects[1];
  127. auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
  128. auto batch_size = features->data->shape[0];
  129. auto num_features = features->data->shape[1];
  130. for (size_t i = 0; i < batch_size; ++i) {
  131. for (size_t j = 0; j < num_features; ++j) {
  132. // 计算索引:batch_size行,num_features列的二维张量
  133. size_t idx = i * num_features + j;
  134. // 每个样本的特征向量加上偏置向量
  135. outNode->data[idx] = features->data->data[idx] + bias->data->data[j];
  136. }
  137. }
  138. // for循环写加法总会写吧🤔
  139. // 补全这里的代码
  140. return outNode;
  141. }
  142. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  143. // assertion needed
  144. auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
  145. // 从张量形状获取维度信息
  146. auto batch_size = gradient->shape[0];
  147. auto num_features = gradient->shape[1]; // 从shape中获取num_features
  148. // 补全这里的代码
  149. auto batch_size = gradient->shape[0];
  150. auto num_features = gradient->shape[1];
  151. // 初始化偏置梯度为零
  152. for (size_t j = 0; j < num_features; ++j)
  153. {
  154. g_bias->data[j] = 0.0f;
  155. }
  156. // 计算偏置的梯度:对每个特征维度,将所有样本的梯度累加
  157. for (size_t i = 0; i < batch_size; ++i) {
  158. for (size_t j = 0; j < num_features; ++j) {
  159. // 累加每个样本对该特征维度的梯度贡献
  160. g_bias->data[j] += gradient->data[i * num_features + j];
  161. }
  162. }
  163. return {gradient, g_bias};
  164. }
  165. std::vector<float> get_data() {
  166. return this->data->data;
  167. }
  168. }; // class AddBias
  169. class Linear: public FunctionNode {
  170. public:
  171. Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  172. this->data=this->forward();
  173. // 这段代码就一行,参考下别的类是怎么写的呢?
  174. // 在这里补全
  175. }
  176. std::shared_ptr<tensor::Tensor> forward() override {
  177. // features: (batch_size x input_features)
  178. auto features = this->objects[0];
  179. // weights: (input_features x output_features)
  180. auto weights = this->objects[1];
  181. auto m = features->data->shape[0];
  182. auto k = features->data->shape[1];
  183. auto n = weights->data->shape[1];
  184. // std::cout << m << " " << n << " " << k << std::endl;
  185. // output: (batch_size x output_features)
  186. auto shape = {m, n};
  187. auto outNode = std::make_shared<tensor::Tensor>(shape);
  188. // 实际上你需要补全的是arith::mm函数,快去找找它在哪里
  189. // 其余部分不需要动
  190. arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
  191. return outNode;
  192. }
  193. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  194. auto features = this->objects[0];
  195. auto weights = this->objects[1];
  196. // gradient.shape[0] == features.shape[0]
  197. // gradient.shape[1] == weights.shape[1]
  198. auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
  199. auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
  200. auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
  201. auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
  202. // 这里要调用两次arith:mm,是分别把哪两个矩阵相乘呢?
  203. return {grad_features, grad_weights};
  204. }
  205. }; //class Linear
  206. class ReLU: public FunctionNode {
  207. public:
  208. ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
  209. // 补全这里
  210. }
  211. std::shared_ptr<tensor::Tensor> forward() override {
  212. // x: a Node with shape (batch_size x num_features)
  213. auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
  214. // 补全这里,调用arith::vector_scalar_max
  215. return outNode;
  216. }
  217. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  218. auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
  219. // 补全这里,一个for循环
  220. return {grads};
  221. }
  222. }; // class ReLU
  223. class Loss: public FunctionNode {
  224. public:
  225. bool used = false;
  226. public:
  227. Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
  228. };
  229. class SquareLoss: public Loss {
  230. public:
  231. SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
  232. // 补全这里的代码
  233. }
  234. std::shared_ptr<tensor::Tensor> forward() {
  235. // a: a Node with shape (batch_size x dim)
  236. // b: a Node with shape (batch_size x dim)
  237. // 这个简单,就是要注意返回的res需要是一个tensor就行
  238. // 修改下面的代码
  239. std::vector<size_t> res_shape = {1};
  240. auto res = std::make_shared<tensor::Tensor>(res_shape);
  241. return res;
  242. }
  243. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  244. float g = gradient->data[0];
  245. auto a = this->objects[0];
  246. auto b = this->objects[1];
  247. auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
  248. auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
  249. // 补全下面的代码
  250. return {grad_a, grad_b};
  251. }
  252. }; // class SquareLoss
  253. std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);
  254. class SoftmaxLoss: public Loss {
  255. public:
  256. SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
  257. this->data = this->forward();
  258. }
  259. std::shared_ptr<tensor::Tensor> forward() {
  260. // 我们已经帮你写好log_softmax
  261. auto log_probs = log_softmax(this->objects[0]->data);
  262. // 补全下面的代码,计算softmax loss
  263. std::vector<size_t> res_shape = {1};
  264. auto res = std::make_shared<tensor::Tensor>(res_shape);
  265. return res;
  266. }
  267. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  268. auto log_probs = log_softmax(this->objects[0]->data);
  269. auto labels = this->objects[1]->data;
  270. auto batch_size = log_probs->shape[0];
  271. auto num_classes = log_probs->shape[1];
  272. auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
  273. auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
  274. // 补全下面的代码
  275. return {grad_logits, grad_labels};
  276. }
  277. }; // class SoftmaxLoss
  278. std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
  279. }