diff --git a/cc/operators/nn.h b/cc/operators/nn.h
index e4824e8..ead58cc 100644
--- a/cc/operators/nn.h
+++ b/cc/operators/nn.h
@@ -136,15 +136,46 @@ public:
         auto bias = this->objects[1];
         auto outNode = std::make_shared(features->data->shape);
         // for循环写加法总会写吧🤔
+        size_t batch_size = features->data->shape[0];
+        size_t num_features = features->data->shape[1];
+
+        // Add bias[j] to element (i, j) of features using a nested loop.
+        for (size_t i = 0; i < batch_size; ++i) {
+            for (size_t j = 0; j < num_features; ++j) {
+                // Index of element (i, j) in the flat 1-D vector.
+                size_t index = i * num_features + j;
+                outNode->data[index] = features->data->data[index] + bias->data->data[j];
+            }
+        }
         // 补全这里的代码
         return outNode;
     }
     std::vector> backward(std::shared_ptr gradient) override {
-        // assertion needed
-        auto g_bias = std::make_shared(this->objects[1]->data->shape);
-        // 补全这里的代码
-
-        return {gradient, g_bias};
+        // Fetch the features and bias operands.
+        auto features = this->objects[0];
+        auto bias = this->objects[1];
+
+        // Read batch_size and num_features from the features shape.
+        size_t batch_size = features->data->shape[0];
+        size_t num_features = features->data->shape[1];
+
+        // grad_features is a straight copy of the upstream gradient.
+        auto grad_features = std::make_shared(features->data->shape);
+        grad_features->data = gradient->data;
+
+        // grad_bias is the sum of the upstream gradient down each column.
+        auto grad_bias = std::make_shared(bias->data->shape);
+        for (size_t j = 0; j < num_features; ++j) {
+            float column_sum = 0.0f;
+            for (size_t i = 0; i < batch_size; ++i) {
+                // Index of element (i, j) in the flat 1-D vector.
+                size_t index = i * num_features + j;
+                column_sum += gradient->data[index];
+            }
+            grad_bias->data[j] = column_sum;
+        }
+
+        return {grad_features, grad_bias};
     }
     std::vector get_data() {
         return this->data->data;
@@ -194,17 +225,21 @@ class ReLU: public FunctionNode {
 public:
     ReLU(std::shared_ptr a) : FunctionNode(a) {
         // 补全这里
+        this->data = this->forward();
     }
     std::shared_ptr forward() override {
         // x: a Node with shape (batch_size x num_features)
         auto outNode = std::make_shared(this->objects[0]->data->shape);
         // 补全这里,调用arith::vector_scalar_max
+        arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, this->objects[0]->data->size, 0);
         return outNode;
     }
     std::vector> backward(std::shared_ptr gradient) override {
         auto grads = std::make_shared(this->objects[0]->data->shape);
         // 补全这里,一个for循环
-
+        for (size_t i = 0; i < this->objects[0]->data->size; ++i) {
+            grads->data[i] = (this->objects[0]->data->data[i] > 0) ? gradient->data[i] : 0;
+        }
         return {grads};
     }
 }; // class ReLU
@@ -220,14 +255,29 @@ class SquareLoss: public Loss {
 public:
     SquareLoss(std::shared_ptr a, std::shared_ptr b): Loss(a, b) {
         // 补全这里的代码
+        this->data = this->forward();
     }
     std::shared_ptr forward() {
         // a: a Node with shape (batch_size x dim)
         // b: a Node with shape (batch_size x dim)
         // 这个简单,就是要注意返回的res需要是一个tensor就行
         // 修改下面的代码
+        auto a = this->objects[0];
+        auto b = this->objects[1];
+        float loss = 0.0f;
+
+        // Accumulate the squared difference over every element.
+        for (size_t i = 0; i < a->data->size; ++i) {
+            float diff = a->data->data[i] - b->data->data[i];
+            loss += diff * diff;
+        }
+
+        // Divide the sum by 2 to get the final loss.
+        loss /= 2.0f;
+
         std::vector res_shape = {1};
         auto res = std::make_shared(res_shape);
+        res->data[0] = loss;
         return res;
     }
     std::vector> backward(std::shared_ptr gradient) override {
@@ -237,6 +287,18 @@ public:
         auto grad_a = std::make_shared(a->data->shape);
         auto grad_b = std::make_shared(b->data->shape);
         // 补全下面的代码
+        // NOTE(review): forward() returns sum(diff^2) / 2 with no division by size, but the
+        // gradients below are scaled by 1 / size -- confirm the intended scaling so they agree.
+        // Number of elements.
+        size_t size = a->data->size;
+        // Walk every element and fill in both gradients.
+        for (size_t i = 0; i < size; ++i) {
+            float diff = a->data->data[i] - b->data->data[i];
+            // i-th entry of grad_a.
+            grad_a->data[i] = g * diff / size;
+            // i-th entry of grad_b (opposite sign).
+            grad_b->data[i] = -g * diff / size;
+        }
         return {grad_a, grad_b};
     }
 }; // class SquareLoss
@@ -253,6 +315,23 @@ public:
         // 我们已经帮你写好log_softmax
         auto log_probs = log_softmax(this->objects[0]->data);
         // 补全下面的代码,计算softmax loss
+        // Ground-truth labels.
+        auto labels = this->objects[1]->data;
+        // Running loss total.
+        float loss = 0.0f;
+        // Loop over every sample.
+        for (size_t i = 0; i < log_probs->shape[0]; ++i) {
+            // Loop over every class.
+            for (size_t j = 0; j < log_probs->shape[1]; ++j) {
+                // Flat index of (sample i, class j).
+                size_t index = i * log_probs->shape[1] + j;
+                // Accumulate -label * log_prob.
+                loss += -labels->data[index] * log_probs->data[index];
+            }
+        }
+        // Average over the samples.
+        loss /= log_probs->shape[0];
         std::vector res_shape = {1};
         auto res = std::make_shared(res_shape);
+        res->data[0] = loss;
         return res;
@@ -264,7 +343,26 @@ public:
         auto num_classes = log_probs->shape[1];
         auto grad_logits = std::make_shared(log_probs->shape);
         auto grad_labels = std::make_shared(labels->shape);
-        // 补全下面的代码
+
+        // log_probs is the log_softmax output, so exponentiate it to recover the softmax probabilities.
+        std::shared_ptr probs = std::make_shared(log_probs->shape);
+        for (size_t i = 0; i < log_probs->size; ++i) {
+            probs->data[i] = std::exp(log_probs->data[i]);
+        }
+
+        float g = gradient->data[0];
+        // Fill grad_logits.
+        for (size_t i = 0; i < batch_size; ++i) {
+            for (size_t j = 0; j < num_classes; ++j) {
+                size_t index = i * num_classes + j;
+                // Gradient is (softmax(logits) - label), scaled by g / batch_size.
+                grad_logits->data[index] = g * (probs->data[index] - labels->data[index]) / batch_size;
+            }
+        }
+
+        // No gradient is propagated to the labels; set grad_labels to 0.
+        std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0);
+
         return {grad_logits, grad_labels};
     }
 }; // class SoftmaxLoss