diff --git a/cc/math/arith.h b/cc/math/arith.h index ffdfec4..e8adadd 100644 --- a/cc/math/arith.h +++ b/cc/math/arith.h @@ -11,10 +11,26 @@ float mean(const std::vector& x); template void mm(const std::vector& a, const std::vector& b, std::vector& c, size_t m, size_t k, size_t n) { // 补全这里,谢谢 + c.resize(m * n); + + std::fill(c.begin(), c.end(), static_cast(0)); + + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + for (size_t p = 0; p < k; ++p) { + c[i * n + j] += a[i * k + p] * b[p * n + j]; + } + } + } } template void vector_scalar_max(const std::vector& a, std::vector &b, T scalar) { // 补全这里,谢谢 + b.resize(a.size()); + + for (size_t i = 0; i < a.size(); ++i) { + b[i] = (a[i] > scalar) ? a[i] : scalar; + } } } \ No newline at end of file diff --git a/cc/operators/nn.h b/cc/operators/nn.h index ec07c1c..dd1b9e4 100644 --- a/cc/operators/nn.h +++ b/cc/operators/nn.h @@ -205,6 +205,18 @@ public: return outNode; } + // 辅助函数:矩阵转置 + template + std::vector transpose(const std::vector& mat, size_t rows, size_t cols) { + std::vector result(rows * cols); + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + result[j * rows + i] = mat[i * cols + j]; + } + } + return result; + } + std::vector> backward(std::shared_ptr gradient) override { auto features = this->objects[0]; auto weights = this->objects[1]; @@ -214,7 +226,23 @@ public: auto grad_features = std::make_shared(grad_features_shape); auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]}; auto grad_weights = std::make_shared(grad_weights_shape); - // 这里要调用两次arith:mm,是分别把哪两个矩阵相乘呢? + + // 计算输入特征的梯度 + // grad_features = gradient * weights^T + auto weights_transposed = transpose(weights->data->data, weights->data->shape[0], weights->data->shape[1]); + size_t m = gradient->shape[0]; + size_t k = weights->data->shape[1]; + size_t n = weights->data->shape[0]; + arith::mm(gradient->data, weights_transposed, grad_features->data, m, k, n); + + // 计算权重的梯度 + // grad_weights = features^T * gradient + auto features_transposed = transpose(features->data->data, features->data->shape[0], features->data->shape[1]); + m = features->data->shape[1]; + k = features->data->shape[0]; + n = gradient->shape[1]; + arith::mm(features_transposed, gradient->data, grad_weights->data, m, k, n); + return {grad_features, grad_weights}; } }; //class Linear @@ -223,17 +251,25 @@ class ReLU: public FunctionNode { public: ReLU(std::shared_ptr a) : FunctionNode(a) { // 补全这里 + this->data = this->forward(); } std::shared_ptr forward() override { // x: a Node with shape (batch_size x num_features) auto outNode = std::make_shared(this->objects[0]->data->shape); // 补全这里,调用arith::vector_scalar_max + arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, 0.0f); return outNode; } std::vector> backward(std::shared_ptr gradient) override { auto grads = std::make_shared(this->objects[0]->data->shape); // 补全这里,一个for循环 - + for (size_t i = 0; i < grads->size; ++i) { + if (this->objects[0]->data->data[i] > 0) { + grads->data[i] = gradient->data[i]; + } else { + grads->data[i] = 0; + } + } return {grads}; } }; // class ReLU @@ -249,14 +285,24 @@ class SquareLoss: public Loss { public: SquareLoss(std::shared_ptr a, std::shared_ptr b): Loss(a, b) { // 补全这里的代码 + this->data = this->forward(); } std::shared_ptr forward() { // a: a Node with shape (batch_size x dim) // b: a Node with shape (batch_size x dim) // 这个简单,就是要注意返回的res需要是一个tensor就行 + auto a = this->objects[0]->data; + auto b = this->objects[1]->data; + float sum_squared_diff = 0.0f; + for (size_t i = 0; i < a->size; ++i) { + float diff = a->data[i] - b->data[i]; + sum_squared_diff += diff * diff; + } // 修改下面的代码 + float square_loss = sum_squared_diff / a->size; std::vector res_shape = {1}; auto res = std::make_shared(res_shape); + res->data[0] = square_loss; return res; } std::vector> backward(std::shared_ptr gradient) override { @@ -266,6 +312,12 @@ public: auto grad_a = std::make_shared(a->data->shape); auto grad_b = std::make_shared(b->data->shape); // 补全下面的代码 + size_t n = a->data->size; + for (size_t i = 0; i < n; ++i) { + float diff = a->data->data[i] - b->data->data[i]; + grad_a->data[i] = g * (2.0f / n) * diff; + grad_b->data[i] = -g * (2.0f / n) * diff; + } return {grad_a, grad_b}; } }; // class SquareLoss @@ -282,18 +334,52 @@ public: // 我们已经帮你写好log_softmax auto log_probs = log_softmax(this->objects[0]->data); // 补全下面的代码,计算softmax loss + auto labels = this->objects[1]->data; + // 样本数量 + auto batch_size = log_probs->shape[0]; + // 类别数量 + auto num_classes = log_probs->shape[1]; + // 初始化损失值 + float loss = 0.0f; + + // 计算 softmax 损失 + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_classes; ++j) { + // 计算索引 + size_t idx = i * num_classes + j; + // 累加损失 + loss += labels->data[idx] * log_probs->data[idx]; + } + } + // 求平均损失 + loss = -loss / batch_size; + std::vector res_shape = {1}; auto res = std::make_shared(res_shape); + res->data[0] = loss; return res; } std::vector> backward(std::shared_ptr gradient) override { auto log_probs = log_softmax(this->objects[0]->data); + std::vector probs(log_probs->data.size()); + for (size_t i = 0; i < log_probs->data.size(); ++i) { + probs[i] = std::exp(log_probs->data[i]); + } auto labels = this->objects[1]->data; auto batch_size = log_probs->shape[0]; auto num_classes = log_probs->shape[1]; auto grad_logits = std::make_shared(log_probs->shape); auto grad_labels = std::make_shared(labels->shape); // 补全下面的代码 + for (size_t i = 0; i < batch_size; ++i) { + for (size_t j = 0; j < num_classes; ++j) { + size_t idx = i * num_classes + j; + // 根据公式计算梯度 + grad_logits->data[idx] = (probs[idx] - labels->data[idx]) / batch_size; + } + } + + std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0.0f); return {grad_logits, grad_labels}; } }; // class SoftmaxLoss