|
|
@@ -205,6 +205,18 @@ public: |
|
|
return outNode; |
|
|
return outNode; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Helper: matrix transpose.
//
// Treats `mat` as a `rows` x `cols` matrix laid out row-major (element (i, j)
// lives at index i * cols + j) and returns a new vector holding the
// `cols` x `rows` transpose in the same row-major layout (element (j, i) lives
// at index j * rows + i).
//
// `mat` must contain at least rows * cols elements; no bounds check is done.
// Declared static because it reads no instance state.
template<typename T>
static std::vector<T> transpose(const std::vector<T>& mat, size_t rows, size_t cols) {
    std::vector<T> result(rows * cols);
    for (size_t r = 0; r < rows; ++r) {
        // Offset of the first element of source row r.
        const size_t row_base = r * cols;
        for (size_t c = 0; c < cols; ++c) {
            result[c * rows + r] = mat[row_base + c];
        }
    }
    return result;
}
|
|
|
|
|
|
|
|
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { |
|
|
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { |
|
|
auto features = this->objects[0]; |
|
|
auto features = this->objects[0]; |
|
|
auto weights = this->objects[1]; |
|
|
auto weights = this->objects[1]; |
|
|
@@ -214,7 +226,23 @@ public: |
|
|
auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape); |
|
|
auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape); |
|
|
auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]}; |
|
|
auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]}; |
|
|
auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape); |
|
|
auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape); |
|
|
// 这里要调用两次arith:mm,是分别把哪两个矩阵相乘呢? |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 计算输入特征的梯度 |
|
|
|
|
|
// grad_features = gradient * weights^T |
|
|
|
|
|
auto weights_transposed = transpose(weights->data->data, weights->data->shape[0], weights->data->shape[1]); |
|
|
|
|
|
size_t m = gradient->shape[0]; |
|
|
|
|
|
size_t k = weights->data->shape[1]; |
|
|
|
|
|
size_t n = weights->data->shape[0]; |
|
|
|
|
|
arith::mm(gradient->data, weights_transposed, grad_features->data, m, k, n); |
|
|
|
|
|
|
|
|
|
|
|
// 计算权重的梯度 |
|
|
|
|
|
// grad_weights = features^T * gradient |
|
|
|
|
|
auto features_transposed = transpose(features->data->data, features->data->shape[0], features->data->shape[1]); |
|
|
|
|
|
m = features->data->shape[1]; |
|
|
|
|
|
k = features->data->shape[0]; |
|
|
|
|
|
n = gradient->shape[1]; |
|
|
|
|
|
arith::mm(features_transposed, gradient->data, grad_weights->data, m, k, n); |
|
|
|
|
|
|
|
|
return {grad_features, grad_weights}; |
|
|
return {grad_features, grad_weights}; |
|
|
} |
|
|
} |
|
|
}; //class Linear |
|
|
}; //class Linear |
|
|
@@ -223,17 +251,25 @@ class ReLU: public FunctionNode { |
|
|
public: |
|
|
public: |
|
|
ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { |
|
|
ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { |
|
|
// 补全这里 |
|
|
// 补全这里 |
|
|
|
|
|
this->data = this->forward(); |
|
|
} |
|
|
} |
|
|
std::shared_ptr<tensor::Tensor> forward() override { |
|
|
std::shared_ptr<tensor::Tensor> forward() override { |
|
|
// x: a Node with shape (batch_size x num_features) |
|
|
// x: a Node with shape (batch_size x num_features) |
|
|
auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); |
|
|
auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); |
|
|
// 补全这里,调用arith::vector_scalar_max |
|
|
// 补全这里,调用arith::vector_scalar_max |
|
|
|
|
|
arith::vector_scalar_max(this->objects[0]->data->data, outNode->data, 0.0f); |
|
|
return outNode; |
|
|
return outNode; |
|
|
} |
|
|
} |
|
|
// Backward pass.
//
// Returns a one-element vector holding the gradient with respect to the
// input: the incoming `gradient` is passed through at every position where
// the input value is strictly positive and replaced by zero elsewhere.
//
// `gradient` is indexed over the input's element count; it is assumed to have
// at least that many elements (not checked here).
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
    auto input = this->objects[0]->data;
    auto grads = std::make_shared<tensor::Tensor>(input->shape);
    for (size_t i = 0; i < grads->size; ++i) {
        const bool active = input->data[i] > 0;
        grads->data[i] = active ? gradient->data[i] : 0;
    }
    return {grads};
}
|
|
}; // class ReLU |
|
|
}; // class ReLU |
|
|
@@ -249,14 +285,24 @@ class SquareLoss: public Loss { |
|
|
public: |
|
|
public: |
|
|
SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { |
|
|
SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { |
|
|
// 补全这里的代码 |
|
|
// 补全这里的代码 |
|
|
|
|
|
this->data = this->forward(); |
|
|
} |
|
|
} |
|
|
std::shared_ptr<tensor::Tensor> forward() { |
|
|
std::shared_ptr<tensor::Tensor> forward() { |
|
|
// a: a Node with shape (batch_size x dim) |
|
|
// a: a Node with shape (batch_size x dim) |
|
|
// b: a Node with shape (batch_size x dim) |
|
|
// b: a Node with shape (batch_size x dim) |
|
|
// 这个简单,就是要注意返回的res需要是一个tensor就行 |
|
|
// 这个简单,就是要注意返回的res需要是一个tensor就行 |
|
|
|
|
|
auto a = this->objects[0]->data; |
|
|
|
|
|
auto b = this->objects[1]->data; |
|
|
|
|
|
float sum_squared_diff = 0.0f; |
|
|
|
|
|
for (size_t i = 0; i < a->size; ++i) { |
|
|
|
|
|
float diff = a->data[i] - b->data[i]; |
|
|
|
|
|
sum_squared_diff += diff * diff; |
|
|
|
|
|
} |
|
|
// 修改下面的代码 |
|
|
// 修改下面的代码 |
|
|
|
|
|
float square_loss = sum_squared_diff / a->size; |
|
|
std::vector<size_t> res_shape = {1}; |
|
|
std::vector<size_t> res_shape = {1}; |
|
|
auto res = std::make_shared<tensor::Tensor>(res_shape); |
|
|
auto res = std::make_shared<tensor::Tensor>(res_shape); |
|
|
|
|
|
res->data[0] = square_loss; |
|
|
return res; |
|
|
return res; |
|
|
} |
|
|
} |
|
|
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { |
|
|
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { |
|
|
@@ -266,6 +312,12 @@ public: |
|
|
auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); |
|
|
auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); |
|
|
auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); |
|
|
auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); |
|
|
// 补全下面的代码 |
|
|
// 补全下面的代码 |
|
|
|
|
|
size_t n = a->data->size; |
|
|
|
|
|
for (size_t i = 0; i < n; ++i) { |
|
|
|
|
|
float diff = a->data->data[i] - b->data->data[i]; |
|
|
|
|
|
grad_a->data[i] = g * (2.0f / n) * diff; |
|
|
|
|
|
grad_b->data[i] = -g * (2.0f / n) * diff; |
|
|
|
|
|
} |
|
|
return {grad_a, grad_b}; |
|
|
return {grad_a, grad_b}; |
|
|
} |
|
|
} |
|
|
}; // class SquareLoss |
|
|
}; // class SquareLoss |
|
|
@@ -282,18 +334,52 @@ public: |
|
|
// 我们已经帮你写好log_softmax |
|
|
// 我们已经帮你写好log_softmax |
|
|
auto log_probs = log_softmax(this->objects[0]->data); |
|
|
auto log_probs = log_softmax(this->objects[0]->data); |
|
|
// 补全下面的代码,计算softmax loss |
|
|
// 补全下面的代码,计算softmax loss |
|
|
|
|
|
auto labels = this->objects[1]->data; |
|
|
|
|
|
// 样本数量 |
|
|
|
|
|
auto batch_size = log_probs->shape[0]; |
|
|
|
|
|
// 类别数量 |
|
|
|
|
|
auto num_classes = log_probs->shape[1]; |
|
|
|
|
|
// 初始化损失值 |
|
|
|
|
|
float loss = 0.0f; |
|
|
|
|
|
|
|
|
|
|
|
// 计算 softmax 损失 |
|
|
|
|
|
for (size_t i = 0; i < batch_size; ++i) { |
|
|
|
|
|
for (size_t j = 0; j < num_classes; ++j) { |
|
|
|
|
|
// 计算索引 |
|
|
|
|
|
size_t idx = i * num_classes + j; |
|
|
|
|
|
// 累加损失 |
|
|
|
|
|
loss += labels->data[idx] * log_probs->data[idx]; |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
// 求平均损失 |
|
|
|
|
|
loss = -loss / batch_size; |
|
|
|
|
|
|
|
|
std::vector<size_t> res_shape = {1}; |
|
|
std::vector<size_t> res_shape = {1}; |
|
|
auto res = std::make_shared<tensor::Tensor>(res_shape); |
|
|
auto res = std::make_shared<tensor::Tensor>(res_shape); |
|
|
|
|
|
res->data[0] = loss; |
|
|
return res; |
|
|
return res; |
|
|
} |
|
|
} |
|
|
// Backward pass of the softmax cross-entropy loss.
//
// Returns {grad_logits, grad_labels}:
//   grad_logits[i][j] = upstream * (softmax(logits)[i][j] - labels[i][j]) / batch_size
//   grad_labels       = all zeros (no gradient is propagated to the labels)
//
// `upstream` is element 0 of `gradient`, the incoming gradient of the scalar
// loss; it falls back to 1 when `gradient` is null or empty. With an upstream
// value of 1 the result is identical to the unscaled formula.
// NOTE(review): assumes the upstream gradient of this shape-{1} loss is stored
// in gradient->data[0], matching how SquareLoss::backward scales by `g` --
// confirm against the caller.
std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
    auto log_probs = log_softmax(this->objects[0]->data);
    auto labels = this->objects[1]->data;
    auto batch_size = log_probs->shape[0];
    auto num_classes = log_probs->shape[1];
    auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
    auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);

    // Chain rule: scale by the gradient flowing into the loss.
    const float upstream = (gradient && gradient->data.size() > 0) ? gradient->data[0] : 1.0f;

    // Rows are contiguous, so the (i, j) double loop collapses to one flat
    // pass over batch_size * num_classes elements. exp(log_softmax) recovers
    // the softmax probability without a temporary buffer.
    const size_t total = batch_size * num_classes;
    for (size_t idx = 0; idx < total; ++idx) {
        const float prob = std::exp(log_probs->data[idx]);
        grad_logits->data[idx] = upstream * (prob - labels->data[idx]) / batch_size;
    }

    std::fill(grad_labels->data.begin(), grad_labels->data.end(), 0.0f);
    return {grad_logits, grad_labels};
}
|
|
}; // class SoftmaxLoss |
|
|
}; // class SoftmaxLoss |
|
|
|