You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nn.h 14 kB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. #pragma once
  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <iostream>
  #include <memory>
  #include <stdexcept>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>

  #include <pybind11/pybind11.h>
  #include <pybind11/numpy.h>

  #include "../tensor/tensor.h"
  #include "../math/arith.h"
  12. namespace py = pybind11;
  13. namespace nn {
  14. class Node {
  15. public:
  16. std::shared_ptr<tensor::Tensor> data;
  17. std::vector<std::shared_ptr<Node>> objects;
  18. std::vector<std::shared_ptr<tensor::Tensor>> gradient;
  19. public:
  20. Node() {}
  21. virtual std::shared_ptr<tensor::Tensor> forward() = 0;
  22. virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
  23. std::vector<std::shared_ptr<Node>> get_parents() {
  24. return this->objects;
  25. }
  26. std::vector<float> get_data() {
  27. return this->data->data;
  28. }
  29. std::shared_ptr<tensor::Tensor> get_tensor() {
  30. return this->data;
  31. }
  32. // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0;
  33. // virtual void zero_grad() = 0;
  34. virtual ~Node() {}
  35. };
  36. class DataNode: public Node {
  37. public:
  38. DataNode() {}
  39. }; // class DataNode
  40. class Parameter: public DataNode {
  41. public:
  42. // Parameter(const std::vector<std::size_t>& shape) {
  43. // this->data = std::make_shared<tensor::Tensor>(shape, true);
  44. // }
  45. Parameter(py::array_t<float> array) {
  46. py::buffer_info info = array.request();
  47. float* dataPtr = static_cast<float*>(info.ptr);
  48. std::vector<std::size_t> shape = {};
  49. for (auto &it: info.shape) {
  50. shape.push_back(it);
  51. }
  52. auto tensor = std::make_shared<tensor::Tensor>(shape);
  53. std::vector<float> result(dataPtr, dataPtr + info.size);
  54. tensor->data = result;
  55. this->data = tensor;
  56. }
  57. std::shared_ptr<tensor::Tensor> forward() {
  58. return this->data;
  59. };
  60. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
  61. return {gradient};
  62. };
  63. void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
  64. for (auto i = 0; i < this->data->size; i++) {
  65. this->data->data[i] -= lr * grad->data[i];
  66. }
  67. }
  68. }; // class Parameter
  69. class Constant: public DataNode {
  70. public:
  71. Constant(std::shared_ptr<tensor::Tensor> data) {
  72. this->data = data;
  73. }
  74. Constant(py::array_t<float> array) {
  75. this->data = tensor::pyarray_to_tensor(array);
  76. }
  77. std::shared_ptr<tensor::Tensor> forward() {
  78. return this->data;
  79. };
  80. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
  81. return {gradient};
  82. };
  83. // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
  84. }; // class Constant
  85. class FunctionNode: public Node {
  86. public:
  87. FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
  88. this->objects.emplace_back(a);
  89. this->objects.emplace_back(b);
  90. }
  91. FunctionNode(std::shared_ptr<Node> a) {
  92. this->objects.emplace_back(a);
  93. }
  94. std::shared_ptr<tensor::Tensor> forward() override {
  95. return nullptr;
  96. }
  97. }; //class FunctionNode
  98. class Add: public FunctionNode {
  99. public:
  100. Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  101. this->data = this->forward();
  102. }
  103. std::shared_ptr<tensor::Tensor> forward() override {
  104. auto a = this->objects[0];
  105. auto b = this->objects[1];
  106. auto outNode = std::make_shared<tensor::Tensor>(a->data->shape);
  107. for (auto i = 0; i < a->data->size; i++) {
  108. outNode->data[i] = a->data->data[i] + b->data->data[i];
  109. }
  110. return outNode;
  111. }
  112. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  113. // assertion needed
  114. return {gradient, gradient};
  115. }
  116. };
  117. class AddBias: public FunctionNode {
  118. public:
  119. AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  120. this->data = this->forward();
  121. }
  122. std::shared_ptr<tensor::Tensor> forward() override {
  123. // features: a Node with shape (batch_size x num_features)
  124. // bias: a Node with shape (1 x num_features)
  125. auto features = this->objects[0];
  126. auto bias = this->objects[1];
  127. auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
  128. auto batch_size = features->data->shape[0];
  129. auto num_features = features->data->shape[1];
  130. for (size_t i = 0; i < batch_size; ++i) {
  131. for (size_t j = 0; j < num_features; ++j) {
  132. // 计算索引:batch_size行,num_features列的二维张量
  133. size_t idx = i * num_features + j;
  134. // 每个样本的特征向量加上偏置向量
  135. outNode->data[idx] = features->data->data[idx] + bias->data->data[j];
  136. }
  137. }
  138. // for循环写加法总会写吧🤔
  139. // 补全这里的代码
  140. return outNode;
  141. }
  142. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  143. // assertion needed
  144. auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
  145. // 从张量形状获取维度信息
  146. auto batch_size = gradient->shape[0];
  147. auto num_features = gradient->shape[1]; // 从shape中获取num_features
  148. // 补全这里的代码
  149. // 初始化偏置梯度为零
  150. for (size_t j = 0; j < num_features; ++j)
  151. {
  152. g_bias->data[j] = 0.0f;
  153. }
  154. // 计算偏置的梯度:对每个特征维度,将所有样本的梯度累加
  155. for (size_t i = 0; i < batch_size; ++i) {
  156. for (size_t j = 0; j < num_features; ++j) {
  157. // 累加每个样本对该特征维度的梯度贡献
  158. g_bias->data[j] += gradient->data[i * num_features + j];
  159. }
  160. }
  161. return {gradient, g_bias};
  162. }
  163. std::vector<float> get_data() {
  164. return this->data->data;
  165. }
  166. }; // class AddBias
  167. class Linear: public FunctionNode {
  168. public:
  169. Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
  170. this->data=this->forward();
  171. // 这段代码就一行,参考下别的类是怎么写的呢?
  172. // 在这里补全
  173. }
  174. std::shared_ptr<tensor::Tensor> forward() override {
  175. // features: (batch_size x input_features)
  176. auto features = this->objects[0];
  177. // weights: (input_features x output_features)
  178. auto weights = this->objects[1];
  179. auto m = features->data->shape[0];
  180. auto k = features->data->shape[1];
  181. auto n = weights->data->shape[1];
  182. // std::cout << m << " " << n << " " << k << std::endl;
  183. // output: (batch_size x output_features)
  184. auto shape = {m, n};
  185. auto outNode = std::make_shared<tensor::Tensor>(shape);
  186. // 实际上你需要补全的是arith::mm函数,快去找找它在哪里
  187. // 其余部分不需要动
  188. arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
  189. return outNode;
  190. }
  191. // 辅助函数:矩阵转置
  192. template<typename T>
  193. std::vector<T> transpose(const std::vector<T>& mat, size_t rows, size_t cols) {
  194. std::vector<T> result(rows * cols);
  195. for (size_t i = 0; i < rows; ++i) {
  196. for (size_t j = 0; j < cols; ++j) {
  197. result[j * rows + i] = mat[i * cols + j];
  198. }
  199. }
  200. return result;
  201. }
  202. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  203. auto features = this->objects[0];
  204. auto weights = this->objects[1];
  205. // gradient.shape[0] == features.shape[0]
  206. // gradient.shape[1] == weights.shape[1]
  207. auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
  208. auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
  209. auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
  210. auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
  211. // 计算输入特征的梯度
  212. // grad_features = gradient * weights^T
  213. auto weights_transposed = transpose(weights->data->data, weights->data->shape[0], weights->data->shape[1]);
  214. size_t m = gradient->shape[0];
  215. size_t k = weights->data->shape[1];
  216. size_t n = weights->data->shape[0];
  217. arith::mm(gradient->data, weights_transposed, grad_features->data, m, k, n);
  218. // 计算权重的梯度
  219. // grad_weights = features^T * gradient
  220. auto features_transposed = transpose(features->data->data, features->data->shape[0], features->data->shape[1]);
  221. m = features->data->shape[1];
  222. k = features->data->shape[0];
  223. n = gradient->shape[1];
  224. arith::mm(features_transposed, gradient->data, grad_weights->data, m, k, n);
  225. return {grad_features, grad_weights};
  226. }
  227. }; //class Linear
  228. class ReLU: public FunctionNode {
  229. public:
  230. ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
  231. // 补全这里
  232. this->data = this->forward();
  233. }
  234. std::shared_ptr<tensor::Tensor> forward() override {
  235. // x: a Node with shape (batch_size x num_features)
  236. auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
  237. // 补全这里,调用arith::vector_scalar_max
  238. arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, 0.0f);
  239. return outNode;
  240. }
  241. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  242. auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
  243. // 补全这里,一个for循环
  244. for (size_t i = 0; i < grads->size; ++i) {
  245. if (this->objects[0]->data->data[i] > 0) {
  246. grads->data[i] = gradient->data[i];
  247. } else {
  248. grads->data[i] = 0;
  249. }
  250. }
  251. return {grads};
  252. }
  253. }; // class ReLU
  254. class Loss: public FunctionNode {
  255. public:
  256. bool used = false;
  257. public:
  258. Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
  259. };
  260. class SquareLoss: public Loss {
  261. public:
  262. SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
  263. // 补全这里的代码
  264. this->data = this->forward();
  265. }
  266. std::shared_ptr<tensor::Tensor> forward() {
  267. // a: a Node with shape (batch_size x dim)
  268. // b: a Node with shape (batch_size x dim)
  269. // 这个简单,就是要注意返回的res需要是一个tensor就行
  270. auto a = this->objects[0]->data;
  271. auto b = this->objects[1]->data;
  272. float sum_squared_diff = 0.0f;
  273. for (size_t i = 0; i < a->size; ++i) {
  274. float diff = a->data[i] - b->data[i];
  275. sum_squared_diff += diff * diff;
  276. }
  277. // 修改下面的代码
  278. float square_loss = sum_squared_diff / a->size;
  279. std::vector<size_t> res_shape = {1};
  280. auto res = std::make_shared<tensor::Tensor>(res_shape);
  281. res->data[0] = square_loss;
  282. return res;
  283. }
  284. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  285. float g = gradient->data[0];
  286. auto a = this->objects[0];
  287. auto b = this->objects[1];
  288. auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
  289. auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
  290. // 补全下面的代码
  291. size_t n = a->data->size;
  292. for (size_t i = 0; i < n; ++i) {
  293. float diff = a->data->data[i] - b->data->data[i];
  294. grad_a->data[i] = g * (2.0f / n) * diff;
  295. grad_b->data[i] = -g * (2.0f / n) * diff;
  296. }
  297. return {grad_a, grad_b};
  298. }
  299. }; // class SquareLoss
  // Declaration only; the definition is not in this header. Takes a logits
  // tensor and returns a tensor that SoftmaxLoss indexes as a 2-D
  // (batch_size x num_classes) matrix and exponentiates to get probabilities.
  300. std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);
  301. class SoftmaxLoss: public Loss {
  302. public:
  303. SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
  304. this->data = this->forward();
  305. }
  306. std::shared_ptr<tensor::Tensor> forward() {
  307. // 我们已经帮你写好log_softmax
  308. auto log_probs = log_softmax(this->objects[0]->data);
  309. // 补全下面的代码,计算softmax loss
  310. auto labels = this->objects[1]->data;
  311. // 样本数量
  312. auto batch_size = log_probs->shape[0];
  313. // 类别数量
  314. auto num_classes = log_probs->shape[1];
  315. // 初始化损失值
  316. float loss = 0.0f;
  317. // 计算 softmax 损失
  318. for (size_t i = 0; i < batch_size; ++i) {
  319. for (size_t j = 0; j < num_classes; ++j) {
  320. // 计算索引
  321. size_t idx = i * num_classes + j;
  322. // 累加损失
  323. loss += labels->data[idx] * log_probs->data[idx];
  324. }
  325. }
  326. // 求平均损失
  327. loss = -loss / batch_size;
  328. std::vector<size_t> res_shape = {1};
  329. auto res = std::make_shared<tensor::Tensor>(res_shape);
  330. res->data[0] = loss;
  331. return res;
  332. }
  333. std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
  334. auto log_probs = log_softmax(this->objects[0]->data);
  335. std::vector<float> probs(log_probs->data.size());
  336. for (size_t i = 0; i < log_probs->data.size(); ++i) {
  337. probs[i] = std::exp(log_probs->data[i]);
  338. }
  339. auto labels = this->objects[1]->data;
  340. auto batch_size = log_probs->shape[0];
  341. auto num_classes = log_probs->shape[1];
  342. auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
  343. auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
  344. // 补全下面的代码
  345. for (size_t i = 0; i < batch_size; ++i) {
  346. for (size_t j = 0; j < num_classes; ++j) {
  347. size_t idx = i * num_classes + j;
  348. // 根据公式计算梯度
  349. grad_logits->data[idx] = (probs[idx] - labels->data[idx]) / batch_size;
  350. }
  351. }
  352. std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0.0f);
  353. return {grad_logits, grad_labels};
  354. }
  355. }; // class SoftmaxLoss
  // Declaration only; the definition is not in this header. Takes a loss node
  // and a list of nodes and returns a list of tensors - presumably one
  // gradient per entry of `parameters`, in the same order (TODO confirm
  // against the definition).
  356. std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
  357. }