| @@ -136,15 +136,46 @@ public: | |||||
| auto bias = this->objects[1]; | auto bias = this->objects[1]; | ||||
| auto outNode = std::make_shared<tensor::Tensor>(features->data->shape); | auto outNode = std::make_shared<tensor::Tensor>(features->data->shape); | ||||
| // for循环写加法总会写吧🤔 | // for循环写加法总会写吧🤔 | ||||
| size_t batch_size = features->data->shape[0]; | |||||
| size_t num_features = features->data->shape[1]; | |||||
| // 使用嵌套循环将 features 的每个元素与 bias 的对应元素相加 | |||||
| for (size_t i = 0; i < batch_size; ++i) { | |||||
| for (size_t j = 0; j < num_features; ++j) { | |||||
| // 计算当前元素在一维向量中的索引 | |||||
| size_t index = i * num_features + j; | |||||
| outNode->data[index] = features->data->data[index] + bias->data->data[j]; | |||||
| } | |||||
| } | |||||
| // 补全这里的代码 | // 补全这里的代码 | ||||
| return outNode; | return outNode; | ||||
| } | } | ||||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | ||||
| // assertion needed | |||||
| auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape); | |||||
| // 补全这里的代码 | |||||
| return {gradient, g_bias}; | |||||
| // 获取 features 和 bias 的信息 | |||||
| auto features = this->objects[0]; | |||||
| auto bias = this->objects[1]; | |||||
| // 获取 batch_size 和 num_features | |||||
| size_t batch_size = features->data->shape[0]; | |||||
| size_t num_features = features->data->shape[1]; | |||||
| // 计算 grad_features,直接复制 gradient | |||||
| auto grad_features = std::make_shared<tensor::Tensor>(features->data->shape); | |||||
| grad_features->data = gradient->data; | |||||
| // 计算 grad_bias,将 gradient 每一列元素相加 | |||||
| auto grad_bias = std::make_shared<tensor::Tensor>(bias->data->shape); | |||||
| for (size_t j = 0; j < num_features; ++j) { | |||||
| float column_sum = 0.0f; | |||||
| for (size_t i = 0; i < batch_size; ++i) { | |||||
| // 计算当前元素在一维向量中的索引 | |||||
| size_t index = i * num_features + j; | |||||
| column_sum += gradient->data[index]; | |||||
| } | |||||
| grad_bias->data[j] = column_sum; | |||||
| } | |||||
| return {grad_features, grad_bias}; | |||||
| } | } | ||||
| std::vector<float> get_data() { | std::vector<float> get_data() { | ||||
| return this->data->data; | return this->data->data; | ||||
| @@ -194,17 +225,21 @@ class ReLU: public FunctionNode { | |||||
| public: | public: | ||||
| ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { | ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { | ||||
| // 补全这里 | // 补全这里 | ||||
| this->data = this->forward(); | |||||
| } | } | ||||
| std::shared_ptr<tensor::Tensor> forward() override { | std::shared_ptr<tensor::Tensor> forward() override { | ||||
| // x: a Node with shape (batch_size x num_features) | // x: a Node with shape (batch_size x num_features) | ||||
| auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | ||||
| // 补全这里,调用arith::vector_scalar_max | // 补全这里,调用arith::vector_scalar_max | ||||
| arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, this->objects[0]->data->size, 0); | |||||
| return outNode; | return outNode; | ||||
| } | } | ||||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | ||||
| auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | ||||
| // 补全这里,一个for循环 | // 补全这里,一个for循环 | ||||
| for (size_t i = 0; i < this->objects[0]->data->size; ++i) { | |||||
| grads->data[i] = (this->objects[0]->data->data[i] > 0) ? gradient->data[i] : 0; | |||||
| } | |||||
| return {grads}; | return {grads}; | ||||
| } | } | ||||
| }; // class ReLU | }; // class ReLU | ||||
| @@ -220,14 +255,29 @@ class SquareLoss: public Loss { | |||||
| public: | public: | ||||
| SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { | SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { | ||||
| // 补全这里的代码 | // 补全这里的代码 | ||||
| this->data = this->forward(); | |||||
| } | } | ||||
| std::shared_ptr<tensor::Tensor> forward() { | std::shared_ptr<tensor::Tensor> forward() { | ||||
| // a: a Node with shape (batch_size x dim) | // a: a Node with shape (batch_size x dim) | ||||
| // b: a Node with shape (batch_size x dim) | // b: a Node with shape (batch_size x dim) | ||||
| // 这个简单,就是要注意返回的res需要是一个tensor就行 | // 这个简单,就是要注意返回的res需要是一个tensor就行 | ||||
| // 修改下面的代码 | // 修改下面的代码 | ||||
| auto a = this->objects[0]; | |||||
| auto b = this->objects[1]; | |||||
| float loss = 0.0f; | |||||
| // 遍历所有元素,计算均方误差损失 | |||||
| for (size_t i = 0; i < a->data->size; ++i) { | |||||
| float diff = a->data->data[i] - b->data->data[i]; | |||||
| loss += diff * diff; | |||||
| } | |||||
| // 除以 2 得到最终损失 | |||||
| loss /= 2.0f; | |||||
| std::vector<size_t> res_shape = {1}; | std::vector<size_t> res_shape = {1}; | ||||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | auto res = std::make_shared<tensor::Tensor>(res_shape); | ||||
| res->data[0] = loss; | |||||
| return res; | return res; | ||||
| } | } | ||||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | ||||
| @@ -237,6 +287,16 @@ public: | |||||
| auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); | auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); | ||||
| auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); | auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); | ||||
| // 补全下面的代码 | // 补全下面的代码 | ||||
| // 计算元素数量 | |||||
| size_t size = a->data->size; | |||||
| // 遍历所有元素,计算梯度 | |||||
| for (size_t i = 0; i < size; ++i) { | |||||
| float diff = a->data->data[i] - b->data->data[i]; | |||||
| // 计算 grad_a 的第 i 个元素的梯度 | |||||
| grad_a->data[i] = g * diff / size; | |||||
| // 计算 grad_b 的第 i 个元素的梯度 | |||||
| grad_b->data[i] = -g * diff / size; | |||||
| } | |||||
| return {grad_a, grad_b}; | return {grad_a, grad_b}; | ||||
| } | } | ||||
| }; // class SquareLoss | }; // class SquareLoss | ||||
| @@ -253,6 +313,22 @@ public: | |||||
| // 我们已经帮你写好log_softmax | // 我们已经帮你写好log_softmax | ||||
| auto log_probs = log_softmax(this->objects[0]->data); | auto log_probs = log_softmax(this->objects[0]->data); | ||||
| // 补全下面的代码,计算softmax loss | // 补全下面的代码,计算softmax loss | ||||
| // 获取真实标签 | |||||
| auto labels = this->objects[1]->data; | |||||
| // 初始化损失值 | |||||
| float loss = 0.0f; | |||||
| // 遍历每个样本 | |||||
| for (size_t i = 0; i < log_probs->shape[0]; ++i) { | |||||
| // 遍历每个类别 | |||||
| for (size_t j = 0; j < log_probs->shape[1]; ++j) { | |||||
| // 计算当前样本当前类别的索引 | |||||
| size_t index = i * log_probs->shape[1] + j; | |||||
| // 累加损失值 | |||||
| loss += -labels->data[index] * log_probs->data[index]; | |||||
| } | |||||
| } | |||||
| // 计算平均损失 | |||||
| loss /= log_probs->shape[0]; | |||||
| std::vector<size_t> res_shape = {1}; | std::vector<size_t> res_shape = {1}; | ||||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | auto res = std::make_shared<tensor::Tensor>(res_shape); | ||||
| return res; | return res; | ||||
| @@ -264,7 +340,26 @@ public: | |||||
| auto num_classes = log_probs->shape[1]; | auto num_classes = log_probs->shape[1]; | ||||
| auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape); | auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape); | ||||
| auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape); | auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape); | ||||
| // 补全下面的代码 | |||||
| // 计算 softmax 概率,因为 log_probs 是 log_softmax 的结果,所以需要 exp 还原 | |||||
| std::shared_ptr<tensor::Tensor> probs = std::make_shared<tensor::Tensor>(log_probs->shape); | |||||
| for (size_t i = 0; i < log_probs->size; ++i) { | |||||
| probs->data[i] = std::exp(log_probs->data[i]); | |||||
| } | |||||
| float g = gradient->data[0]; | |||||
| // 计算 grad_logits | |||||
| for (size_t i = 0; i < batch_size; ++i) { | |||||
| for (size_t j = 0; j < num_classes; ++j) { | |||||
| size_t index = i * num_classes + j; | |||||
| // 计算梯度,公式为 softmax(logits)_i - y_true_i | |||||
| grad_logits->data[index] = g * (probs->data[index] - labels->data[index]) / batch_size; | |||||
| } | |||||
| } | |||||
| // grad_labels 通常不需要计算梯度,设为 0 | |||||
| std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0); | |||||
| return {grad_logits, grad_labels}; | return {grad_logits, grad_labels}; | ||||
| } | } | ||||
| }; // class SoftmaxLoss | }; // class SoftmaxLoss | ||||