| @@ -136,15 +136,46 @@ public: | |||
| auto bias = this->objects[1]; | |||
| auto outNode = std::make_shared<tensor::Tensor>(features->data->shape); | |||
| // for循环写加法总会写吧🤔 | |||
| size_t batch_size = features->data->shape[0]; | |||
| size_t num_features = features->data->shape[1]; | |||
| // 使用嵌套循环将 features 的每个元素与 bias 的对应元素相加 | |||
| for (size_t i = 0; i < batch_size; ++i) { | |||
| for (size_t j = 0; j < num_features; ++j) { | |||
| // 计算当前元素在一维向量中的索引 | |||
| size_t index = i * num_features + j; | |||
| outNode->data[index] = features->data->data[index] + bias->data->data[j]; | |||
| } | |||
| } | |||
| // 补全这里的代码 | |||
| return outNode; | |||
| } | |||
| // Backward pass of the bias-add node. | |||
| // gradient: upstream gradient, indexed row-major as (batch_size x num_features). | |||
| // Returns {grad_features, grad_bias}: grad_features is a copy of the upstream | |||
| // gradient, and grad_bias[j] is the sum of column j of the upstream gradient. | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| // TODO: assert that gradient has the same shape as the features input. | |||
| auto features = this->objects[0]; | |||
| auto bias = this->objects[1]; | |||
| // Dimensions are taken from the features input. | |||
| const size_t batch_size = features->data->shape[0]; | |||
| const size_t num_features = features->data->shape[1]; | |||
| // The addition passes the upstream gradient through to features unchanged. | |||
| auto grad_features = std::make_shared<tensor::Tensor>(features->data->shape); | |||
| grad_features->data = gradient->data; | |||
| // The same bias element is added to every row, so its gradient is the | |||
| // sum of the upstream gradient over the batch dimension. | |||
| auto grad_bias = std::make_shared<tensor::Tensor>(bias->data->shape); | |||
| for (size_t j = 0; j < num_features; ++j) { | |||
| float column_sum = 0.0f; | |||
| for (size_t i = 0; i < batch_size; ++i) { | |||
| // Flat index of element (i, j) in the row-major buffer. | |||
| const size_t index = i * num_features + j; | |||
| column_sum += gradient->data[index]; | |||
| } | |||
| grad_bias->data[j] = column_sum; | |||
| } | |||
| return {grad_features, grad_bias}; | |||
| } | |||
| std::vector<float> get_data() { | |||
| return this->data->data; | |||
| @@ -194,17 +225,21 @@ class ReLU: public FunctionNode { | |||
| public: | |||
| ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { | |||
| // Run the forward pass once at construction and keep its result as this node's value. | |||
| auto activated = this->forward(); | |||
| this->data = activated; | |||
| } | |||
| // Forward pass of ReLU: applies arith::vector_scalar_max with scalar 0 to the | |||
| // input buffer and returns the result in a new tensor of the same shape. | |||
| // The input is expected to have shape (batch_size x num_features). | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| const auto& input = this->objects[0]->data; | |||
| auto activated = std::make_shared<tensor::Tensor>(input->shape); | |||
| // Arguments: source buffer, destination buffer, element count, scalar. | |||
| arith::vector_scalar_max(input->data, activated->data, input->size, 0); | |||
| return activated; | |||
| } | |||
| // Backward pass of ReLU: forwards the upstream gradient where the input was | |||
| // strictly positive and writes 0 everywhere else. Returns a single tensor | |||
| // shaped like the input. | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| const auto& input = this->objects[0]->data; | |||
| auto masked = std::make_shared<tensor::Tensor>(input->shape); | |||
| for (size_t k = 0; k < input->size; ++k) { | |||
| if (input->data[k] > 0) { | |||
| masked->data[k] = gradient->data[k]; | |||
| } else { | |||
| masked->data[k] = 0; | |||
| } | |||
| } | |||
| return {masked}; | |||
| } | |||
| }; // class ReLU | |||
| @@ -220,14 +255,29 @@ class SquareLoss: public Loss { | |||
| public: | |||
| SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { | |||
| // Compute the loss once at construction and keep it as this node's value. | |||
| auto loss_value = this->forward(); | |||
| this->data = loss_value; | |||
| } | |||
| // Forward pass of the square loss: half the mean squared difference of the | |||
| // two inputs, i.e. sum((a - b)^2) / (2 * N), where N is a's element count. | |||
| // a, b: nodes expected to have shape (batch_size x dim) and equal element counts. | |||
| // Returns a one-element tensor holding the loss (0 when a has no elements). | |||
| std::shared_ptr<tensor::Tensor> forward() { | |||
| auto a = this->objects[0]; | |||
| auto b = this->objects[1]; | |||
| const size_t count = a->data->size; | |||
| float loss = 0.0f; | |||
| // Accumulate the squared element-wise differences. | |||
| for (size_t i = 0; i < count; ++i) { | |||
| const float diff = a->data->data[i] - b->data->data[i]; | |||
| loss += diff * diff; | |||
| } | |||
| // Normalize by 2 * N so the value matches backward(), which scales each | |||
| // element's gradient by 1 / N. Skip the division for empty input to avoid 0 / 0. | |||
| if (count > 0) { | |||
| loss /= 2.0f * static_cast<float>(count); | |||
| } | |||
| // The result must be a tensor, so wrap the scalar in shape {1}. | |||
| std::vector<size_t> res_shape = {1}; | |||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | |||
| res->data[0] = loss; | |||
| return res; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| @@ -237,6 +287,16 @@ public: | |||
| auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); | |||
| auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); | |||
| // 补全下面的代码 | |||
| // 计算元素数量 | |||
| size_t size = a->data->size; | |||
| // 遍历所有元素,计算梯度 | |||
| for (size_t i = 0; i < size; ++i) { | |||
| float diff = a->data->data[i] - b->data->data[i]; | |||
| // 计算 grad_a 的第 i 个元素的梯度 | |||
| grad_a->data[i] = g * diff / size; | |||
| // 计算 grad_b 的第 i 个元素的梯度 | |||
| grad_b->data[i] = -g * diff / size; | |||
| } | |||
| return {grad_a, grad_b}; | |||
| } | |||
| }; // class SquareLoss | |||
| @@ -253,6 +313,22 @@ public: | |||
| // 我们已经帮你写好log_softmax | |||
| auto log_probs = log_softmax(this->objects[0]->data); | |||
| // 补全下面的代码,计算softmax loss | |||
| // 获取真实标签 | |||
| auto labels = this->objects[1]->data; | |||
| // 初始化损失值 | |||
| float loss = 0.0f; | |||
| // 遍历每个样本 | |||
| for (size_t i = 0; i < log_probs->shape[0]; ++i) { | |||
| // 遍历每个类别 | |||
| for (size_t j = 0; j < log_probs->shape[1]; ++j) { | |||
| // 计算当前样本当前类别的索引 | |||
| size_t index = i * log_probs->shape[1] + j; | |||
| // 累加损失值 | |||
| loss += -labels->data[index] * log_probs->data[index]; | |||
| } | |||
| } | |||
| // 计算平均损失 | |||
| loss /= log_probs->shape[0]; | |||
| std::vector<size_t> res_shape = {1}; | |||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | |||
| return res; | |||
| @@ -264,7 +340,26 @@ public: | |||
| auto num_classes = log_probs->shape[1]; | |||
| auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape); | |||
| auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape); | |||
| // 补全下面的代码 | |||
| // 计算 softmax 概率,因为 log_probs 是 log_softmax 的结果,所以需要 exp 还原 | |||
| std::shared_ptr<tensor::Tensor> probs = std::make_shared<tensor::Tensor>(log_probs->shape); | |||
| for (size_t i = 0; i < log_probs->size; ++i) { | |||
| probs->data[i] = std::exp(log_probs->data[i]); | |||
| } | |||
| float g = gradient->data[0]; | |||
| // 计算 grad_logits | |||
| for (size_t i = 0; i < batch_size; ++i) { | |||
| for (size_t j = 0; j < num_classes; ++j) { | |||
| size_t index = i * num_classes + j; | |||
| // 计算梯度,公式为 softmax(logits)_i - y_true_i | |||
| grad_logits->data[index] = g * (probs->data[index] - labels->data[index]) / batch_size; | |||
| } | |||
| } | |||
| // grad_labels 通常不需要计算梯度,设为 0 | |||
| std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0); | |||
| return {grad_logits, grad_labels}; | |||
| } | |||
| }; // class SoftmaxLoss | |||