| @@ -0,0 +1,15 @@ | |||
# Build script for the `uctc` pybind11 extension module.
cmake_minimum_required(VERSION 3.19)
project(uctc)

# Build as C++17 and fail at configure time if the compiler cannot provide it
# (without *_REQUIRED, CMake may silently fall back to an older standard).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# The former `set(CXX g++)` was removed: a plain CMake variable named CXX does
# not select the compiler. Pick one with -DCMAKE_CXX_COMPILER=... or the CXX
# environment variable when configuring.

# Append -O3 rather than overwrite, so flags supplied by the user or a
# toolchain file are preserved.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

# Location of the pybind11 package. The default is the original developer path;
# override it on other machines with -DPYBIND11_DIR=/path/to/pybind11.
set(PYBIND11_DIR "/home/hexu/miniconda3/lib/python3.11/site-packages/pybind11"
    CACHE PATH "Directory of the pybind11 installation")
set(PYBIND11_FINDPYTHON ON)
find_package(pybind11 CONFIG REQUIRED PATHS "${PYBIND11_DIR}")

pybind11_add_module(uctc uctc.cc math/arith.cc operators/nn.cc operators/ops.cc tensor/tensor.cc)

# After each build, run pybind11-stubgen from the build directory to generate
# Python type stubs for the freshly built module.
add_custom_command(
    TARGET uctc POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E echo "Changing directory and running Python script for generate interpreter annotations"
    COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR} pybind11-stubgen uctc --output-dir .
)
| @@ -0,0 +1,14 @@ | |||
| #include "arith.h" | |||
namespace arith {

// Returns the single-precision square root of `x`.
float sqrt(float x) {
    return std::sqrt(x);
}

// Returns the arithmetic mean of `x` as a float.
//
// The sum is accumulated in double so that the division is a real
// floating-point division: the previous `int / size_t` expression truncated
// the result and turned negative sums into huge unsigned values.
// An empty vector yields 0.0f instead of dividing by zero.
float mean(const std::vector<int>& x) {
    if (x.empty()) {
        return 0.0f;
    }
    const double total = std::accumulate(x.begin(), x.end(), 0.0);
    return static_cast<float>(total / static_cast<double>(x.size()));
}

}
| @@ -0,0 +1,20 @@ | |||
| #pragma once | |||
| #include <cmath> | |||
| #include <vector> | |||
| #include <numeric> | |||
namespace arith {

// Single-precision square root (defined in arith.cc).
float sqrt(float x);

// Arithmetic mean of an integer vector (defined in arith.cc).
float mean(const std::vector<int>& x);

// Dense row-major matrix multiplication: c = a * b.
//
//   a : m x k matrix stored row-major in a flat vector
//   b : k x n matrix stored row-major in a flat vector
//   c : output, resized to m * n and overwritten with the m x n product
//
// The caller must pass `a` with at least m*k elements and `b` with at least
// k*n elements; `c` must not alias `a` or `b`.
template<typename T>
void mm(const std::vector<T>& a, const std::vector<T>& b, std::vector<T>& c, size_t m, size_t k, size_t n) {
    c.assign(m * n, T{});
    // i-k-j loop order keeps the innermost accesses to `b` and `c` contiguous.
    for (size_t row = 0; row < m; ++row) {
        for (size_t inner = 0; inner < k; ++inner) {
            const T lhs = a[row * k + inner];
            for (size_t col = 0; col < n; ++col) {
                c[row * n + col] += lhs * b[inner * n + col];
            }
        }
    }
}

// Element-wise maximum against a scalar: b[i] = max(a[i], scalar).
// `b` is resized to a.size(); passing the same vector for `a` and `b` is fine.
template<typename T>
void vector_scalar_max(const std::vector<T>& a, std::vector<T> &b, T scalar) {
    b.resize(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        b[i] = a[i] > scalar ? a[i] : scalar;
    }
}

}
| @@ -0,0 +1,32 @@ | |||
| #include "autodiff.h" | |||
| namespace autodiff { | |||
| std::vector<std::shared_ptr<ScalarFunction>> topoSort(const std::vector<std::shared_ptr<ScalarFunction>>& scalars) { | |||
| std::vector<std::shared_ptr<ScalarFunction>> sorted; | |||
| std::vector<std::shared_ptr<ScalarFunction>> frontier; | |||
| std::unordered_map<std::shared_ptr<ScalarFunction>, int> degree; | |||
| for (auto it: scalars) { | |||
| if (it->degree == 0) { | |||
| frontier.push_back(it); | |||
| } | |||
| else { | |||
| degree.insert({it, it->degree}); | |||
| } | |||
| } | |||
| while (!frontier.empty()) { | |||
| auto back = frontier.back(); | |||
| sorted.push_back(back); | |||
| for (auto &it: degree) { | |||
| if (it.second > 0 && it.first == back) { | |||
| it.second--; | |||
| if (it.second == 0) { | |||
| frontier.push_back(it.first); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return sorted; | |||
| } | |||
| } | |||
| @@ -0,0 +1,211 @@ | |||
| #pragma once | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <cmath> | |||
| #include <unordered_map> | |||
| namespace autodiff { | |||
// Numerically estimates the partial derivative of `func` with respect to
// element `arg` of `vec` using a central difference:
//
//     (f(x + eps) - f(x - eps)) / ((x + eps) - (x - eps))
//
// The denominator is the step that was actually representable in T rather
// than the nominal 2 * epsilon; in single precision the two differ noticeably
// and dividing by the nominal step would bias the estimate.
//
//   vec     : point at which to differentiate (left unmodified; a copy is perturbed)
//   func    : callable taking the vector and returning a scalar
//   arg     : index of the coordinate to perturb; std::out_of_range if invalid
//   epsilon : perturbation size
//
// T is expected to be a floating-point type. If `epsilon` is below the
// resolution of T at vec[arg], the step collapses to zero and the result is
// not finite.
template<typename T, typename F>
auto central_difference(std::vector<T>& vec, F func, std::size_t arg, float epsilon = 1e-6) {
    std::vector<T> probe = vec;
    const T center = probe.at(arg);
    const T upper = center + static_cast<T>(epsilon);
    const T lower = center - static_cast<T>(epsilon);

    probe[arg] = upper;
    const auto value_upper = func(probe);
    probe[arg] = lower;
    const auto value_lower = func(probe);

    return (value_upper - value_lower) / (upper - lower);
}
// Base class of every scalar node in the autodiff graph.
//
// All members are value-initialized so that a freshly constructed node never
// exposes indeterminate floats (previously `data` and `grad` were left
// uninitialized).
class ScalarFunction {
public:
    float data = 0.0f;  // value produced by the node's forward computation
    float grad = 0.0f;  // gradient slot for this node
    int degree = 0;     // number of inputs; derived classes set 1 or 2

public:
    ScalarFunction() = default;
}; // class ScalarFunction
| class ConstantScalar: public ScalarFunction { | |||
| public: | |||
| ConstantScalar(float data): ScalarFunction() { | |||
| this->data = data; | |||
| } | |||
| }; // class ConstantScalar | |||
| class Add: public ScalarFunction { | |||
| public: | |||
| std::shared_ptr<ScalarFunction> a; | |||
| std::shared_ptr<ScalarFunction> b; | |||
| public: | |||
| // 思考这个构造函数的写法(或让LLM进行解释) | |||
| Add(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b): a(a), b(b) { | |||
| this->data = a->data + b->data; | |||
| this->degree = 2; | |||
| } | |||
| float forward() { | |||
| // 修改这里的return | |||
| return 0; | |||
| } | |||
| std::vector<float> backward(float d_input) { | |||
| // 修改这里的return | |||
| return {0, 0}; | |||
| } | |||
| }; // class Add | |||
| class Log: public ScalarFunction { | |||
| public: | |||
| std::shared_ptr<ScalarFunction> a; | |||
| public: | |||
| Log(std::shared_ptr<ScalarFunction> a): a(a) { | |||
| this->data = this->forward(); | |||
| this->degree = 1; | |||
| } | |||
| float forward() { | |||
| // 补全这里的return语句 | |||
| return 0.0f; | |||
| } | |||
| std::vector<float> backward(float d_input) { | |||
| // 算了,我来帮你写求导的部分吧 | |||
| // 估计你已经忘记$log(x)$求导是什么了 | |||
| return {(1.0f * d_input / a->data)}; | |||
| } | |||
| }; // class Log | |||
| class Mul: public ScalarFunction { | |||
| public: | |||
| std::shared_ptr<ScalarFunction> a; | |||
| std::shared_ptr<ScalarFunction> b; | |||
| public: | |||
| Mul(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b) : a(a), b(b) { | |||
| this->data = this->forward(); | |||
| this->degree = 2; | |||
| } | |||
| float forward() { | |||
| // 修改这里的return | |||
| return 0; | |||
| } | |||
| std::vector<float> backward(float d_input) { | |||
| // 修改这里的return | |||
| return {0, 0}; | |||
| } | |||
| }; // class Mul | |||
| class Inv: public ScalarFunction { | |||
| public: | |||
| std::shared_ptr<ScalarFunction> a; | |||
| public: | |||
| Inv(std::shared_ptr<ScalarFunction> a): a(a) { | |||
| this->data = this->forward(); | |||
| this->degree = 1; | |||
| } | |||
| float forward() { | |||
| return 1.0f / a->data; | |||
| } | |||
| std::vector<float> backward(float d_input) { | |||
| // 修改这里的return语句 | |||
| // 1/x求导是-1/x^2 | |||
| return {0.0f}; | |||
| } | |||
| }; // class Inv | |||
| class Sigmoid: public ScalarFunction { | |||
| public: | |||
| std::shared_ptr<ScalarFunction> a; | |||
| public: | |||
| Sigmoid(std::shared_ptr<ScalarFunction> a): a(a) { | |||
| this->data = this->forward(); | |||
| this->degree = 1; | |||
| } | |||
| float forward() { | |||
| if (this->a->data >= 0.0) { | |||
| return 1.0 / (1.0 + expf(-this->a->data)); | |||
| } | |||
| else { | |||
| return expf(this->a->data) / (1.0 + expf(this->a->data)); | |||
| } | |||
| } | |||
| std::vector<float> backward(float d_input) { | |||
| // 你还是来求一下导吧,预防上大学以后变傻了 | |||
| // 补全这里的代码 | |||
| return {0.0f}; | |||
| } | |||
| }; // class Sigmoid | |||
| // for testing | |||
| bool test_central_difference() { | |||
| std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; | |||
| auto func = [](const std::vector<float>& x) -> float { | |||
| return x[0] + x[1] + x[2] + x[3] + x[4]; | |||
| }; | |||
| auto grad = central_difference(x, func, 2); | |||
| if (abs(grad-1.0f) > 1e-4) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool test_addscalar() { | |||
| auto a = std::make_shared<ConstantScalar>(1.0f); | |||
| auto b = std::make_shared<ConstantScalar>(2.0f); | |||
| auto c = std::make_shared<Add>(a, b); | |||
| if (c->data != 3.0f) { | |||
| return false; | |||
| } | |||
| auto res = c->backward(2.0f); | |||
| auto a_grad = res[0]; | |||
| auto b_grad = res[1]; | |||
| if (a_grad != 2.0f || b_grad != 2.0f) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool test_mulscalar() { | |||
| auto a = std::make_shared<ConstantScalar>(2.0f); | |||
| auto b = std::make_shared<ConstantScalar>(3.0f); | |||
| auto c = std::make_shared<Mul>(a, b); | |||
| if (c->data != 6.0f) { | |||
| return false; | |||
| } | |||
| auto res = c->backward(2.0f); | |||
| auto a_grad = res[0]; | |||
| auto b_grad = res[1]; | |||
| if (a_grad != 6.0f || b_grad != 4.0f) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool test_logscalar() { | |||
| auto a = std::make_shared<ConstantScalar>(2.0f); | |||
| auto b = std::make_shared<Log>(a); | |||
| if (abs(b->data - logf(2.0f)) > 1e-4) { | |||
| return false; | |||
| } | |||
| auto res = b->backward(2.0f); | |||
| auto a_grad = res[0]; | |||
| if (abs(a_grad - 1.0f) > 1e-4) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool test_invscalar() { | |||
| auto a = std::make_shared<ConstantScalar>(2.0f); | |||
| auto b = std::make_shared<Inv>(a); | |||
| if (abs(b->data - 0.5f) > 1e-4) { | |||
| return false; | |||
| } | |||
| auto res = b->backward(2.0f); | |||
| auto a_grad = res[0]; | |||
| if (abs(a_grad + 0.5f) > 1e-4) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool test_sigmoidscalar() { | |||
| auto a = std::make_shared<ConstantScalar>(2.0f); | |||
| auto b = std::make_shared<Sigmoid>(a); | |||
| // TODO:麻烦自己写下测试用例,谢谢 | |||
| // 禁止直接return true,世界上最聪明的智能人工将会逐一检查这段代码 | |||
| return false; | |||
| } | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "nn.h" | |||
| namespace nn { | |||
| std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits) { | |||
| auto batch_size = logits->shape[0]; | |||
| auto num_classes = logits->shape[1]; | |||
| auto log_probs_shape = {batch_size, num_classes}; | |||
| auto log_probs = std::make_shared<tensor::Tensor>(log_probs_shape); | |||
| for (auto i = 0; i < batch_size; i++) { | |||
| auto max_logit = logits->data[i * num_classes]; | |||
| for (auto j = 1; j < num_classes; j++) { | |||
| max_logit = max_logit > logits->data[i * num_classes + j] ? max_logit : logits->data[i * num_classes + j]; | |||
| } | |||
| auto sum_exp = 0.0; | |||
| for (auto j = 0; j < num_classes; j++) { | |||
| log_probs->data[i * num_classes + j] = logits->data[i * num_classes + j] - max_logit; | |||
| sum_exp += exp(log_probs->data[i * num_classes + j]); | |||
| } | |||
| // calculate log(softmax) | |||
| auto log_sum_exp = log(sum_exp); | |||
| for (auto j = 0; j < num_classes; j++) { | |||
| log_probs->data[i * num_classes + j] -= log_sum_exp; | |||
| } | |||
| } | |||
| return log_probs; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters) { | |||
| loss->used = true; | |||
| std::unordered_set<std::shared_ptr<Node>> nodes; | |||
| std::vector<std::shared_ptr<Node>> tape; | |||
| // 递归遍历图并构建计算图 | |||
| std::function<void(std::shared_ptr<Node>)> visit = [&](std::shared_ptr<Node> node) { | |||
| if (nodes.find(node) == nodes.end()) { | |||
| for (const auto& parent : node->get_parents()) { | |||
| visit(parent); | |||
| } | |||
| nodes.insert(node); | |||
| tape.push_back(node); | |||
| } | |||
| }; | |||
| visit(loss); | |||
| for (const auto& param : parameters) { | |||
| nodes.insert(param); | |||
| } | |||
| std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<tensor::Tensor>> grads; | |||
| for (const auto& node : nodes) { | |||
| grads[node] = std::make_shared<tensor::Tensor>(node->data->shape); | |||
| } | |||
| grads[loss] = std::make_shared<tensor::Tensor>(loss->data->shape); | |||
| grads[loss]->data[0] = 1.0; | |||
| for (auto it = tape.rbegin(); it != tape.rend(); it++) { | |||
| // std::cout << "tape it: " << std::endl; | |||
| auto node = *it; | |||
| // if (node->data->shape[0] == 1) { | |||
| // std::cout << "coming to squareloss" << std::endl; | |||
| // } | |||
| auto parent_grads = node->backward(grads[node]); | |||
| auto parents = node->get_parents(); | |||
| for (size_t i = 0; i < parents.size(); i++) { | |||
| // std::cout << "this grad shape: " << grads[parents[i]]->data.size() << std::endl; | |||
| for (auto ind = 0; ind < parents[i]->data->size; ind++) { | |||
| grads[parents[i]]->data[ind] += parent_grads[i]->data[ind]; | |||
| } | |||
| } | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> result; | |||
| for (const auto& param : parameters) { | |||
| result.emplace_back(grads[param]); | |||
| } | |||
| // std::cout << "len(result): " << result.size() << std::endl; | |||
| return result; | |||
| } | |||
| } | |||
| @@ -0,0 +1,274 @@ | |||
| #pragma once | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <unordered_set> | |||
| #include <unordered_map> | |||
| #include <algorithm> | |||
| #include <pybind11/pybind11.h> | |||
| #include <pybind11/numpy.h> | |||
| #include <iostream> | |||
| #include "../tensor/tensor.h" | |||
| #include "../math/arith.h" | |||
| namespace py = pybind11; | |||
| namespace nn { | |||
| class Node { | |||
| public: | |||
| std::shared_ptr<tensor::Tensor> data; | |||
| std::vector<std::shared_ptr<Node>> objects; | |||
| std::vector<std::shared_ptr<tensor::Tensor>> gradient; | |||
| public: | |||
| Node() {} | |||
| virtual std::shared_ptr<tensor::Tensor> forward() = 0; | |||
| virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0; | |||
| std::vector<std::shared_ptr<Node>> get_parents() { | |||
| return this->objects; | |||
| } | |||
| std::vector<float> get_data() { | |||
| return this->data->data; | |||
| } | |||
| std::shared_ptr<tensor::Tensor> get_tensor() { | |||
| return this->data; | |||
| } | |||
| // virtual void update(std::shared_ptr<tensor::Tensor> grad, float lr) = 0; | |||
| // virtual void zero_grad() = 0; | |||
| virtual ~Node() {} | |||
| }; | |||
| class DataNode: public Node { | |||
| public: | |||
| DataNode() {} | |||
| }; // class DataNode | |||
| class Parameter: public DataNode { | |||
| public: | |||
| // Parameter(const std::vector<std::size_t>& shape) { | |||
| // this->data = std::make_shared<tensor::Tensor>(shape, true); | |||
| // } | |||
| Parameter(py::array_t<float> array) { | |||
| py::buffer_info info = array.request(); | |||
| float* dataPtr = static_cast<float*>(info.ptr); | |||
| std::vector<std::size_t> shape = {}; | |||
| for (auto &it: info.shape) { | |||
| shape.push_back(it); | |||
| } | |||
| auto tensor = std::make_shared<tensor::Tensor>(shape); | |||
| std::vector<float> result(dataPtr, dataPtr + info.size); | |||
| tensor->data = result; | |||
| this->data = tensor; | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() { | |||
| return this->data; | |||
| }; | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) { | |||
| return {gradient}; | |||
| }; | |||
| void update(std::shared_ptr<tensor::Tensor> grad, double lr) { | |||
| for (auto i = 0; i < this->data->size; i++) { | |||
| this->data->data[i] -= lr * grad->data[i]; | |||
| } | |||
| } | |||
| }; // class Parameter | |||
| class Constant: public DataNode { | |||
| public: | |||
| Constant(std::shared_ptr<tensor::Tensor> data) { | |||
| this->data = data; | |||
| } | |||
| Constant(py::array_t<float> array) { | |||
| this->data = tensor::pyarray_to_tensor(array); | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() { | |||
| return this->data; | |||
| }; | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) { | |||
| return {gradient}; | |||
| }; | |||
| // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {} | |||
| }; // class Constant | |||
| class FunctionNode: public Node { | |||
| public: | |||
| FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) { | |||
| this->objects.emplace_back(a); | |||
| this->objects.emplace_back(b); | |||
| } | |||
| FunctionNode(std::shared_ptr<Node> a) { | |||
| this->objects.emplace_back(a); | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| return nullptr; | |||
| } | |||
| }; //class FunctionNode | |||
| class Add: public FunctionNode { | |||
| public: | |||
| Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) { | |||
| this->data = this->forward(); | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| auto a = this->objects[0]; | |||
| auto b = this->objects[1]; | |||
| auto outNode = std::make_shared<tensor::Tensor>(a->data->shape); | |||
| for (auto i = 0; i < a->data->size; i++) { | |||
| outNode->data[i] = a->data->data[i] + b->data->data[i]; | |||
| } | |||
| return outNode; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| // assertion needed | |||
| return {gradient, gradient}; | |||
| } | |||
| }; | |||
| class AddBias: public FunctionNode { | |||
| public: | |||
| AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) { | |||
| this->data = this->forward(); | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| // features: a Node with shape (batch_size x num_features) | |||
| // bias: a Node with shape (1 x num_features) | |||
| auto features = this->objects[0]; | |||
| auto bias = this->objects[1]; | |||
| auto outNode = std::make_shared<tensor::Tensor>(features->data->shape); | |||
| // for循环写加法总会写吧🤔 | |||
| // 补全这里的代码 | |||
| return outNode; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| // assertion needed | |||
| auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape); | |||
| // 补全这里的代码 | |||
| return {gradient, g_bias}; | |||
| } | |||
| std::vector<float> get_data() { | |||
| return this->data->data; | |||
| } | |||
| }; // class AddBias | |||
| class Linear: public FunctionNode { | |||
| public: | |||
| Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) { | |||
| // 这段代码就一行,参考下别的类是怎么写的呢? | |||
| // 在这里补全 | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| // features: (batch_size x input_features) | |||
| auto features = this->objects[0]; | |||
| // weights: (input_features x output_features) | |||
| auto weights = this->objects[1]; | |||
| auto m = features->data->shape[0]; | |||
| auto k = features->data->shape[1]; | |||
| auto n = weights->data->shape[1]; | |||
| // std::cout << m << " " << n << " " << k << std::endl; | |||
| // output: (batch_size x output_features) | |||
| auto shape = {m, n}; | |||
| auto outNode = std::make_shared<tensor::Tensor>(shape); | |||
| // 实际上你需要补全的是arith::mm函数,快去找找它在哪里 | |||
| // 其余部分不需要动 | |||
| arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n); | |||
| return outNode; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| auto features = this->objects[0]; | |||
| auto weights = this->objects[1]; | |||
| // gradient.shape[0] == features.shape[0] | |||
| // gradient.shape[1] == weights.shape[1] | |||
| auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]}; | |||
| auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape); | |||
| auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]}; | |||
| auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape); | |||
| // 这里要调用两次arith:mm,是分别把哪两个矩阵相乘呢? | |||
| return {grad_features, grad_weights}; | |||
| } | |||
| }; //class Linear | |||
| class ReLU: public FunctionNode { | |||
| public: | |||
| ReLU(std::shared_ptr<Node> a) : FunctionNode(a) { | |||
| // 补全这里 | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() override { | |||
| // x: a Node with shape (batch_size x num_features) | |||
| auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | |||
| // 补全这里,调用arith::vector_scalar_max | |||
| return outNode; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape); | |||
| // 补全这里,一个for循环 | |||
| return {grads}; | |||
| } | |||
| }; // class ReLU | |||
| class Loss: public FunctionNode { | |||
| public: | |||
| bool used = false; | |||
| public: | |||
| Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {} | |||
| }; | |||
| class SquareLoss: public Loss { | |||
| public: | |||
| SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) { | |||
| // 补全这里的代码 | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() { | |||
| // a: a Node with shape (batch_size x dim) | |||
| // b: a Node with shape (batch_size x dim) | |||
| // 这个简单,就是要注意返回的res需要是一个tensor就行 | |||
| // 修改下面的代码 | |||
| std::vector<size_t> res_shape = {1}; | |||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | |||
| return res; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| float g = gradient->data[0]; | |||
| auto a = this->objects[0]; | |||
| auto b = this->objects[1]; | |||
| auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape); | |||
| auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape); | |||
| // 补全下面的代码 | |||
| return {grad_a, grad_b}; | |||
| } | |||
| }; // class SquareLoss | |||
| std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits); | |||
| class SoftmaxLoss: public Loss { | |||
| public: | |||
| SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) { | |||
| this->data = this->forward(); | |||
| } | |||
| std::shared_ptr<tensor::Tensor> forward() { | |||
| // 我们已经帮你写好log_softmax | |||
| auto log_probs = log_softmax(this->objects[0]->data); | |||
| // 补全下面的代码,计算softmax loss | |||
| std::vector<size_t> res_shape = {1}; | |||
| auto res = std::make_shared<tensor::Tensor>(res_shape); | |||
| return res; | |||
| } | |||
| std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override { | |||
| auto log_probs = log_softmax(this->objects[0]->data); | |||
| auto labels = this->objects[1]->data; | |||
| auto batch_size = log_probs->shape[0]; | |||
| auto num_classes = log_probs->shape[1]; | |||
| auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape); | |||
| auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape); | |||
| // 补全下面的代码 | |||
| return {grad_logits, grad_labels}; | |||
| } | |||
| }; // class SoftmaxLoss | |||
// Backpropagates from `loss` through the computation graph and returns one
// gradient tensor per entry of `parameters`, in the same order (defined in
// nn.cc). Also sets `loss->used` to true.
std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
| } | |||
| @@ -0,0 +1,54 @@ | |||
| #include "ops.h" | |||
namespace operators {

// Absolute tolerance used by is_close.
static constexpr float epsilon = 1e-6f;

// Returns 1.0f when |x - y| < epsilon, otherwise 0.0f.
float is_close(float x, float y) {
    return std::fabs(x - y) < epsilon ? 1.0f : 0.0f;
}

// Logistic sigmoid 1 / (1 + exp(-x)). The branch keeps the exponent argument
// non-positive so exp() cannot overflow for large |x|.
float sigmoid(float x) {
    if (x >= 0.0f) {
        return 1.0f / (1.0f + std::exp(-x));
    }
    const float e = std::exp(x);
    return e / (1.0f + e);
}

// Rectified linear unit: x when x > 0, otherwise 0.
float relu(float x) {
    return x > 0.0f ? x : 0.0f;
}

// Reciprocal 1 / x.
float inv(float x) {
    return 1.0f / x;
}

// Derivative of inv at x times the upstream gradient d: -d / x^2.
float inv_back(float x, float d) {
    return -d / (x * x);
}

// Derivative of relu at x times the upstream gradient d: d when x > 0,
// otherwise 0.
float relu_back(float x, float d) {
    return x > 0.0f ? d : 0.0f;
}

// Sum of all elements; 0 for an empty vector.
auto sumList(const std::vector<float>& vec) -> float {
    return std::accumulate(vec.begin(), vec.end(), 0.0f);
}

// Product of all elements; 1 for an empty vector.
auto prodList(const std::vector<float>& vec) -> float {
    return std::accumulate(vec.begin(), vec.end(), 1.0f, std::multiplies<float>());
}

// Element-wise sum of two vectors of equal length.
// Throws std::invalid_argument when the lengths differ.
auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float> {
    if (vec1.size() != vec2.size()) {
        throw std::invalid_argument("Vectors must have the same size");
    }
    std::vector<float> result(vec1.size());
    std::transform(vec1.begin(), vec1.end(), vec2.begin(), result.begin(), std::plus<float>());
    return result;
}

// Element-wise negation.
auto negList(const std::vector<float>& vec) -> std::vector<float> {
    std::vector<float> result(vec.size());
    std::transform(vec.begin(), vec.end(), result.begin(), std::negate<float>());
    return result;
}

}
| @@ -0,0 +1,88 @@ | |||
| #pragma once | |||
| #include <cmath> | |||
| #include <functional> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <stdexcept> | |||
| #include <numeric> | |||
namespace operators {

// Returns a * b.
template<typename T>
T mul(T a, T b) {
    return a * b;
}

// Identity: returns its argument unchanged.
template<typename T>
T id(T a) {
    return a;
}

// Returns a + b.
template<typename T>
T add(T a, T b) {
    return a + b;
}

// Returns -a.
template<typename T>
T neg(T a) {
    return -a;
}

// Returns 1.0f when a < b, otherwise 0.0f.
template<typename T>
float lt(T a, T b) {
    return a < b ? 1.0f : 0.0f;
}

// Returns 1.0f when a == b, otherwise 0.0f.
template<typename T>
float eq(T a, T b) {
    return a == b ? 1.0f : 0.0f;
}

// Returns the larger of a and b (b when they compare equal).
template<typename T>
T max(T a, T b) {
    return a > b ? a : b;
}

// Applies `func` to every element of `vec` and returns the results in order.
template<typename T, typename F>
auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> {
    std::vector<decltype(func(std::declval<T>()))> result;
    result.reserve(vec.size());
    std::transform(vec.begin(), vec.end(), std::back_inserter(result), func);
    return result;
}

// Combines two vectors element by element: result[i] = func(vec1[i], vec2[i]).
// Throws std::invalid_argument when the vectors differ in length.
template <typename T1, typename T2, typename F>
auto zipWith(const std::vector<T1>& vec1, const std::vector<T2>& vec2, F func)
-> std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))> {
    if (vec1.size() != vec2.size()) {
        throw std::invalid_argument("Vectors must have the same size");
    }
    std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))> result;
    result.reserve(vec1.size());
    std::transform(vec1.begin(), vec1.end(), vec2.begin(), std::back_inserter(result), func);
    return result;
}

// Left fold of `vec` with `func`, starting from `init`.
template<typename T, typename F>
auto reduce(const std::vector<T>& vec, T init, F func) -> T {
    return std::accumulate(vec.begin(), vec.end(), init, func);
}

// Scalar helpers and list operations defined in ops.cc.
float is_close(float x, float y);
float sigmoid(float x);
float relu(float x);
float inv(float x);
float inv_back(float x, float d);
float relu_back(float x, float d);
auto sumList(const std::vector<float>& vec) -> float;
auto prodList(const std::vector<float>& vec) -> float;
auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float>;
auto negList(const std::vector<float>& vec) -> std::vector<float>;

}
| @@ -0,0 +1,12 @@ | |||
| #include "pyarray.h" | |||
| namespace pyarr { | |||
| std::vector<float> ndarray_to_vector(py::array_t<float> array) { | |||
| py::buffer_info info = array.request(); | |||
| float* dataPtr = static_cast<float*>(info.ptr); | |||
| std::vector<float> result(dataPtr, dataPtr + info.size); | |||
| return result; | |||
| } | |||
| } | |||
| @@ -0,0 +1,10 @@ | |||
| #include <pybind11/numpy.h> | |||
| #include <pybind11/pybind11.h> | |||
| namespace py = pybind11; | |||
| namespace pyarr { | |||
| std::vector<float> ndarray_to_vector(py::array_t<float> array); | |||
| } | |||
| @@ -0,0 +1,76 @@ | |||
| #include "tensor.h" | |||
| namespace tensor { | |||
| std::shared_ptr<Tensor> Tensor::transpose() { | |||
| // 放心,下面的代码暂时不会被触发,我们假定所有的tensor都是2维的 | |||
| // if (shape.size() != 2) { | |||
| // throw std::runtime_error("Transpose is only supported for 2D tensors."); | |||
| // } | |||
| // 这里能够获得矩阵的行数和列数,但是我们是使用一个一维的vector来存储数据的。该如何实现“转置”呢? | |||
| std::size_t rows = shape[0]; | |||
| std::size_t cols = shape[1]; | |||
| std::vector<size_t> new_shape = {cols, rows}; | |||
| // 你知道这里的size变量在哪里定义的吗?在VSCode里面安装C/C++ Extension Pack后,按下ctrl键并单击变量size,VSCode就会把你导向定义这个变量的地方! | |||
| std::vector<float> transposed_data(size); | |||
| // 请在这里写转置的代码 | |||
| // 请阅读关于Tensor的定义,在这里创建一个新的Tensor | |||
| // 注意,要使用shared_ptr哦! | |||
| return std::make_shared<Tensor>(new_shape); | |||
| } | |||
| std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array) { | |||
| py::buffer_info info = array.request(); | |||
| float* dataPtr = static_cast<float*>(info.ptr); | |||
| std::vector<std::size_t> shape = {}; | |||
| for (auto &it: info.shape) { | |||
| shape.push_back(it); | |||
| } | |||
| auto tensor = std::make_shared<Tensor>(shape); | |||
| std::vector<float> result(dataPtr, dataPtr + info.size); | |||
| tensor->data = result; | |||
| return tensor; | |||
| } | |||
| std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis) { | |||
| // you only need to handle the two dimensional tensor, and the axis can be either 0 or 1 | |||
| // the tensor's shape is (batch_size, features) | |||
| // if the axis is 0, it outputs a tensor (1, features) | |||
| // if the axis is 1, it outputs a tensor (batch_size, 1) | |||
| // compute the output's shape | |||
| std::vector<std::size_t> output_shape = tensor->shape; | |||
| output_shape.erase(output_shape.begin() + axis); | |||
| auto result = std::make_shared<Tensor>(output_shape); | |||
| // 这个问题似乎有点难,所以我们决定给你送点分。一个简单的办法是分axis为0还是为1来进行讨论,反正我们已经把问题简化为了,在一个二维的tensor里面,找到每一行或者每一列的最大值,并输出一个一维的tensor。 | |||
| // 补全这里的代码。 | |||
| return result; | |||
| } | |||
| std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor) { | |||
| std::vector<std::size_t> shape = {1}; | |||
| auto result = std::make_shared<Tensor>(shape); | |||
| auto sum = 0.0f; | |||
| for (auto &it: tensor->data) { | |||
| sum += it; | |||
| } | |||
| sum /= tensor->size; | |||
| result->data[0] = sum; | |||
| return result; | |||
| } | |||
| std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor) { | |||
| auto result = std::make_shared<Tensor>(tensor->shape); | |||
| for (auto i = 0; i < tensor->size; i++) { | |||
| result->data[i] = expf(tensor->data[i]); | |||
| } | |||
| return result; | |||
| } | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| #pragma once | |||
| #include <numeric> | |||
| #include <random> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <stdexcept> | |||
| #include <pybind11/pybind11.h> | |||
| #include <pybind11/numpy.h> | |||
| namespace py = pybind11; | |||
| namespace tensor { | |||
// Dense float tensor stored row-major in a flat vector.
class Tensor {
public:
    std::vector<float> data;         // flat row-major element buffer
    std::vector<std::size_t> shape;  // extent of each dimension
    std::size_t size = 0;            // product of all extents

public:
    // Creates a zero-filled tensor of the given shape.
    //
    // With `rand_init`, elements are instead drawn uniformly from
    // [-limit, limit) where limit = sqrt(3 / ((shape[0] + shape[1]) / 2)),
    // using a generator seeded with 42 so results are reproducible.
    // Throws std::invalid_argument if `rand_init` is requested for a shape
    // with fewer than two dimensions.
    Tensor(const std::vector<std::size_t>& shape, bool rand_init = false) {
        // Accumulate in size_t: the previous int accumulator could overflow
        // or truncate large extents.
        this->size = std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<std::size_t>());
        this->data.resize(this->size);
        this->shape = shape;
        if (rand_init) {
            if (shape.size() < 2) {
                throw std::invalid_argument("rand_init requires a shape with at least two dimensions");
            }
            double limit = std::sqrt(3.0 / ((shape[0] + shape[1]) / 2.0));
            std::mt19937 gen(42);
            std::uniform_real_distribution<float> dis(-limit, limit);
            for (std::size_t i = 0; i < this->size; ++i) {
                this->data[i] = dis(gen);
            }
        }
    }

    // Ordinary value semantics. The class previously declared a const
    // `operator=` that performed an element-wise comparison, which replaced
    // the copy-assignment operator so that `a = b` silently assigned nothing.
    Tensor(const Tensor&) = default;
    Tensor& operator=(const Tensor&) = default;

    // Returns the transpose of a 2-D tensor (defined in tensor.cc).
    std::shared_ptr<Tensor> transpose();

    // Element-wise sum. Throws std::runtime_error when the shapes differ.
    Tensor operator+(const Tensor& other) const {
        if (this->shape != other.shape) {
            throw std::runtime_error("Shapes do not match");
        }
        Tensor result(this->shape);
        for (std::size_t i = 0; i < this->size; ++i) {
            result.data[i] = this->data[i] + other.data[i];
        }
        return result;
    }

    // Element-wise equality mask: result[i] is 1 where the elements are
    // equal and 0 elsewhere. This is the comparison formerly spelled as
    // `operator=`. Throws std::runtime_error when the shapes differ.
    Tensor operator==(const Tensor& other) const {
        if (this->shape != other.shape) {
            throw std::runtime_error("Shapes do not match");
        }
        Tensor result(this->shape);
        for (std::size_t i = 0; i < this->size; ++i) {
            result.data[i] = (this->data[i] == other.data[i]);
        }
        return result;
    }

    // Returns a copy of the shape.
    std::vector<std::size_t> get_shape() const {
        return this->shape;
    }

    // Returns a copy of the flat element buffer.
    std::vector<float> get_data() const {
        return this->data;
    }

    // Reads the element at the multi-dimensional position `indices`.
    // Throws std::out_of_range for a wrong number of indices or an index
    // outside its dimension.
    float get(const std::vector<std::size_t>& indices) const {
        return data[flat_index(indices)];
    }

    // Writes `value` at the multi-dimensional position `indices`.
    // Throws std::out_of_range for a wrong number of indices or an index
    // outside its dimension.
    void set(const std::vector<std::size_t>& indices, float value) {
        data[flat_index(indices)] = value;
    }

    ~Tensor() = default;

private:
    // Converts a multi-dimensional index into a row-major flat offset,
    // validating rank and bounds.
    std::size_t flat_index(const std::vector<std::size_t>& indices) const {
        if (indices.size() != shape.size()) {
            throw std::out_of_range("Tensor index rank does not match tensor rank");
        }
        std::size_t index = 0;
        std::size_t stride = 1;
        // Walk from the last (fastest-varying) dimension to the first.
        for (std::size_t dim = shape.size(); dim-- > 0;) {
            if (indices[dim] >= shape[dim]) {
                throw std::out_of_range("Tensor index out of range");
            }
            index += indices[dim] * stride;
            stride *= shape[dim];
        }
        return index;
    }
}; // class Tensor
// Builds a Tensor with the shape and values of a NumPy float array
// (defined in tensor.cc).
std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array);
// Reduction along `axis` (0 or 1) of a 2-D tensor; the output shape is the
// input shape with that axis removed (defined in tensor.cc).
std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis);
// Returns a one-element tensor holding the average of all elements
// (defined in tensor.cc).
std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor);
// Returns a tensor of the same shape with expf applied element-wise
// (defined in tensor.cc).
std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor);
| } // namespace tensor | |||
| @@ -0,0 +1,117 @@ | |||
| #include <pybind11/pybind11.h> | |||
| #include <pybind11/stl.h> | |||
| #include "math/arith.h" | |||
| #include "operators/nn.h" | |||
| #include "tensor/tensor.h" | |||
| #include "operators/ops.h" | |||
| #include "operators/autodiff.h" | |||
| namespace py = pybind11; | |||
| PYBIND11_MODULE(uctc, m) { | |||
| py::module C = m.def_submodule("C", "C module"); | |||
| py::module arith = C.def_submodule("arith", "Arithmetic module"); | |||
| arith.def("sqrt", &arith::sqrt, "Square root function", py::arg("x") = 0.0); | |||
| py::class_<tensor::Tensor, std::shared_ptr<tensor::Tensor>>(m, "Tensor") | |||
| .def_readonly("shape", &tensor::Tensor::shape) | |||
| .def_readonly("size", &tensor::Tensor::size) | |||
| .def("data", &tensor::Tensor::get_data, "Get the data of the tensor", pybind11::return_value_policy::copy) | |||
| .def("transpose", &tensor::Tensor::transpose, "Transpose the tensor", pybind11::return_value_policy::copy); | |||
| py::module nn = m.def_submodule("nn", "Neural network module"); | |||
| py::class_<nn::Node, std::shared_ptr<nn::Node>>(nn, "Node") | |||
| .def("data", &nn::Node::get_data, "Get the data of the node", pybind11::return_value_policy::copy) | |||
| .def("tensor", &nn::Node::get_tensor, "Get the tensor of the node", pybind11::return_value_policy::automatic_reference); | |||
| py::class_<nn::DataNode, nn::Node, std::shared_ptr<nn::DataNode>>(nn, "DataNode"); | |||
| py::class_<nn::Parameter, nn::DataNode, std::shared_ptr<nn::Parameter>>(nn, "Parameter") | |||
| .def(pybind11::init<py::array_t<float>>(), "Create a parameter from an array.") | |||
| .def("update", &nn::Parameter::update, "Update the parameter node", py::arg("grad") = nullptr, py::arg("learning_rate") = 0.001); | |||
| py::class_<nn::Constant, nn::DataNode, std::shared_ptr<nn::Constant>>(nn, "Constant") | |||
| .def(pybind11::init<py::array_t<float>>(), "Create a constant node from a numpy array"); | |||
| py::class_<nn::FunctionNode, nn::Node, std::shared_ptr<nn::FunctionNode>>(nn, "FunctionNode"); | |||
| py::class_<nn::Add, nn::FunctionNode, std::shared_ptr<nn::Add>>(nn, "Add") | |||
| .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add function node") | |||
| .def("forward", &nn::Add::forward, "Forward function"); | |||
| py::class_<nn::AddBias, nn::FunctionNode, std::shared_ptr<nn::AddBias>>(nn, "AddBias") | |||
| .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add bias function node") | |||
| .def("forward", &nn::AddBias::forward, "Forward function") | |||
| .def("data", &nn::AddBias::get_data, "Get the data of the node", pybind11::return_value_policy::automatic_reference); | |||
| py::class_<nn::Linear, nn::FunctionNode, std::shared_ptr<nn::Linear>>(nn, "Linear") | |||
| .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a linear function node") | |||
| .def("forward", &nn::Linear::forward, "Forward function"); | |||
| py::class_<nn::ReLU, nn::FunctionNode, std::shared_ptr<nn::ReLU>>(nn, "ReLU") | |||
| .def(py::init<std::shared_ptr<nn::Node>>(), "Create a ReLU function node"); | |||
| py::class_<nn::Loss, nn::FunctionNode, std::shared_ptr<nn::Loss>>(nn, "Loss"); | |||
| py::class_<nn::SquareLoss, nn::Loss, std::shared_ptr<nn::SquareLoss>>(nn, "SquareLoss") | |||
| .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a square loss function node"); | |||
| py::class_<nn::SoftmaxLoss, nn::Loss, std::shared_ptr<nn::SoftmaxLoss>>(nn, "SoftmaxLoss") | |||
| .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a softmax loss function node"); | |||
| nn.def("log_softmax", &nn::log_softmax, "Log softmax function", py::arg("logits")); | |||
| nn.def("gradients", &nn::gradients, "Calculate the gradients", py::arg("loss") = nullptr, py::arg("nodes") = std::vector<std::shared_ptr<nn::Node>>{}); | |||
| nn.def("pyarray_to_tensor", &tensor::pyarray_to_tensor, "Convert a numpy array to a tensor", py::arg("arr")); | |||
| nn.def("argmax", &tensor::argmax, "Get a tensor's argmax", py::arg("tensor"), py::arg("axis")); | |||
| nn.def("mean", &tensor::mean, "Get a tensor element's mean value", py::arg("tensor")); | |||
| nn.def("exp", &tensor::exp, "Get exp of a tensor", py::arg("tensor")); | |||
| // framework test | |||
| py::module framework = m.def_submodule("framework", "Framework module"); | |||
| py::module basis = framework.def_submodule("basis", "Basic modules"); | |||
| // task 1 | |||
| basis.def("mul", &operators::mul<int>, "Multiply two integers", py::arg("a"), py::arg("b")); | |||
| basis.def("id", &operators::id<int>, "Identity function", py::arg("a")); | |||
| basis.def("add", &operators::add<int>, "Add two integers", py::arg("a"), py::arg("b")); | |||
| basis.def("neg", &operators::neg<int>, "Negate an integer", py::arg("a")); | |||
| basis.def("lt", &operators::lt<int>, "Less than operator", py::arg("a"), py::arg("b")); | |||
| basis.def("eq", &operators::eq<int>, "Equal operator", py::arg("a"), py::arg("b")); | |||
| basis.def("max", &operators::max<int>, "Max operator", py::arg("a"), py::arg("b")); | |||
| // task 2 | |||
| basis.def("is_close", &operators::is_close, "Check if two floats are close", py::arg("x"), py::arg("y")); | |||
| basis.def("sigmoid", &operators::sigmoid, "Sigmoid function", py::arg("x")); | |||
| basis.def("relu", &operators::relu, "ReLU function", py::arg("x")); | |||
| basis.def("inv", &operators::inv, "Inverse function", py::arg("x")); | |||
| basis.def("inv_back", &operators::inv_back, "Inv back function", py::arg("x"), py::arg("d")); | |||
| basis.def("relu_back", &operators::relu_back, "ReLU back function", py::arg("x"), py::arg("d")); | |||
| // task 3 | |||
| basis.def("negList", &operators::negList, "Negate a list of integers", py::arg("lst")); | |||
| // task 4, 5 | |||
| basis.def("addLists", &operators::addLists, "Add two lists of integers", py::arg("lst1"), py::arg("lst2")); | |||
| // task 6 | |||
| basis.def("sumList", &operators::sumList, "Sum a list of integers", py::arg("lst")); | |||
| // task 7 | |||
| basis.def("prodList", &operators::prodList, "Multiply a list of integers", py::arg("lst")); | |||
| py::module autodiff = framework.def_submodule("autodiff", "Autodiff modules"); | |||
| autodiff.def("test_central_difference", &autodiff::test_central_difference, "Test central difference"); | |||
| autodiff.def("test_addscalar", &autodiff::test_addscalar, "Test add scalar"); | |||
| autodiff.def("test_mulscalar", &autodiff::test_mulscalar, "Test mul scalar"); | |||
| autodiff.def("test_logscalar", &autodiff::test_logscalar, "Test log scalar"); | |||
| autodiff.def("test_invscalar", &autodiff::test_invscalar, "Test inv scalar"); | |||
| autodiff.def("test_sigmoidscalar", &autodiff::test_sigmoidscalar, "Test sigmoid scalar"); | |||
| } | |||
| @@ -0,0 +1,16 @@ | |||
| from uctc.framework import autodiff | |||
| import numpy as np | |||
| from functools import reduce | |||
| import random | |||
# Self-checks exported by the extension's autodiff submodule. Each one is
# called with no arguments and a truthy result is treated as a pass.
lst = [
    autodiff.test_central_difference,
    autodiff.test_addscalar,
    autodiff.test_mulscalar,
    autodiff.test_logscalar,
    autodiff.test_invscalar,
    autodiff.test_sigmoidscalar,
]
for e in lst:
    if e():
        print(f"\033[1;34mPassed: {e.__name__} passed all tests\033[0m")
    else:
        print(f"\033[1;31mError: {e.__name__} failed test... expects true but gets false\033[0m")
        # Stop at the first failure with a non-zero status so shells and CI
        # can tell the run failed (the script used to exit with status 0).
        raise SystemExit(1)
# NOTE(review): the banner says "Task 3" although these are the autodiff
# checks; confirm the intended task number.
print("\033[1;32m[PASSED] Task 3 finished!\033[0m")
| @@ -0,0 +1,2 @@ | |||
| # change this | |||
# Machine-specific absolute path; edit it for your own checkout.
# NOTE(review): presumably the directory that holds the built uctc extension;
# verify against the scripts that read lib_path.
| lib_path = "/home/hexu/learn/uc-modern-cpp-student/cc/build/" | |||
| @@ -0,0 +1,46 @@ | |||
| import numpy as np | |||
| import math | |||
| from uctc.framework import basis | |||
# Integer pairs passed to every two-argument operator check.
binary_arguments = [
    (1, 2), (-2, 1), (1, 1), (2, -2), (1, 3), (3, 1),
    (-3, 3), (4, 5), (5, 4), (4, 4), (5, 5),
]
# Integers passed to every one-argument operator check.
singular_arguments = [1, 2, 4, -32, 42, 28, 0, 100, -1000, 10000, -100000]
def iterate_binary_arguments(func, std_func):
    """Check a two-argument function against a reference implementation.

    Calls both callables with every pair in the module-level
    ``binary_arguments`` list and compares the results with ``!=``.

    Args:
        func: Callable under test; receives the pair unpacked as two arguments.
        std_func: Reference callable taking the same two arguments.

    Returns:
        True when every pair produced equal results.

    Raises:
        SystemExit: With status 1 on the first mismatch, after printing the
            offending arguments and both results.
    """
    for argument in binary_arguments:
        # Evaluate each side once so the reported values are the compared ones.
        actual = func(*argument)
        expected = std_func(*argument)
        if actual != expected:
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the failure used to be reported as exit code 0.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True
def iterate_singular_arguments(func, std_func):
    """Check a one-argument function against a reference implementation.

    Calls both callables with every value in the module-level
    ``singular_arguments`` list and compares the results with ``!=``.

    Args:
        func: Callable under test; receives one value.
        std_func: Reference callable taking the same value.

    Returns:
        True when every value produced equal results.

    Raises:
        SystemExit: With status 1 on the first mismatch, after printing the
            offending argument and both results.
    """
    for argument in singular_arguments:
        # Evaluate each side once so the reported values are the compared ones.
        actual = func(argument)
        expected = std_func(argument)
        if actual != expected:
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the failure used to be reported as exit code 0.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True
| # Test task 1 | |||
# Each call below checks one integer operator from uctc.framework.basis
# against a pure-Python lambda over the shared argument lists. The comparison
# helpers terminate the script on the first mismatch, so the final banner is
# printed only when every check passed.
| iterate_binary_arguments(basis.mul, lambda x, y: x * y) | |||
| iterate_singular_arguments(basis.id, lambda x: x) | |||
| iterate_binary_arguments(basis.add, lambda x, y: x + y) | |||
| iterate_singular_arguments(basis.neg, lambda x: -x) | |||
# lt and eq are compared against int(...) of the Python boolean.
| iterate_binary_arguments(basis.lt, lambda x, y: int(x < y)) | |||
| iterate_binary_arguments(basis.eq, lambda x, y: int(x == y)) | |||
| iterate_binary_arguments(basis.max, lambda x, y: max(x, y)) | |||
| print(f"\033[1;32m[PASSED] Task 1 finished!\033[0m") | |||
| @@ -0,0 +1,55 @@ | |||
| from uctc.framework import basis | |||
| import numpy as np | |||
| import math | |||
# Float pairs passed to every two-argument operator check.
binary_arguments = [
    (1.0, 2.0), (2.0, 1.0), (-1.0, 1.0), (2.0, -2.0), (1.0, 3.0), (3.0, -1.0),
    (3.0, 3.0), (-4.0, -5.0), (5.0, 4.0), (4.0, 4.0), (5.0, 5.0),
]
# Floats passed to every one-argument operator check (none of them is zero).
singular_arguments = [1.0, -3.2, 4.3, 5.5, -6.7, 4.8, 3.33, 2.22, 1.11]
def is_close(x, y):
    """Return True when x and y differ by strictly less than 1e-5."""
    tolerance = 1e-5
    difference = abs(x - y)
    return difference < tolerance
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x).

    Uses the algebraically equivalent form e^x / (1 + e^x) for negative x, so
    the exponential is always taken of a non-positive number.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)
def iterate_binary_arguments(func, std_func):
    """Check a two-argument function against a reference, within tolerance.

    Calls both callables with every pair in the module-level
    ``binary_arguments`` list and compares the results with ``is_close``.

    Args:
        func: Callable under test; receives the pair unpacked as two arguments.
        std_func: Reference callable taking the same two arguments.

    Returns:
        True when every pair produced close results.

    Raises:
        SystemExit: With status 1 on the first mismatch, after printing the
            offending arguments and both results.
    """
    for argument in binary_arguments:
        # Evaluate each side once so the reported values are the compared ones.
        actual = func(*argument)
        expected = std_func(*argument)
        if not is_close(actual, expected):
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the failure used to be reported as exit code 0.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True
def iterate_singular_arguments(func, std_func):
    """Check a one-argument function against a reference, within tolerance.

    Calls both callables with every value in the module-level
    ``singular_arguments`` list and compares the results with ``is_close``.

    Args:
        func: Callable under test; receives one value.
        std_func: Reference callable taking the same value.

    Returns:
        True when every value produced close results.

    Raises:
        SystemExit: With status 1 on the first mismatch, after printing the
            offending argument and both results.
    """
    for argument in singular_arguments:
        # Evaluate each side once so the reported values are the compared ones.
        actual = func(argument)
        expected = std_func(argument)
        if not is_close(actual, expected):
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the failure used to be reported as exit code 0.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True
| # Test task 2 | |||
# Each call below checks one float operator from uctc.framework.basis against
# a pure-Python reference, comparing results with is_close. The comparison
# helpers terminate the script on the first mismatch.
# basis.is_close is compared against 1.0 or 0.0 built from the local is_close.
| iterate_binary_arguments(basis.is_close, lambda x, y: 1.0*int(is_close(x, y))) | |||
| iterate_singular_arguments(basis.sigmoid, lambda x: sigmoid(x)) | |||
| iterate_singular_arguments(basis.relu, lambda x: x if x > 0.0 else 0.0) | |||
| iterate_singular_arguments(basis.inv, lambda x: 1.0/x) | |||
# Reference derivatives: d/dx (1/x) scaled by d, and ReLU's gate applied to d.
| iterate_binary_arguments(basis.inv_back, lambda x, d: -d/(x*x)) | |||
| iterate_binary_arguments(basis.relu_back, lambda x, d: d * 1.0 if x > 0.0 else 0.0) | |||
| print(f"\033[1;32m[PASSED] Task 2 finished!\033[0m") | |||
| @@ -0,0 +1,20 @@ | |||
| from uctc.framework import basis | |||
| import numpy as np | |||
| import math | |||
| import random | |||
def is_close(x, y):
    """Return True when x and y differ by strictly less than 1e-5."""
    return abs(x - y) < 1e-5


# Compare basis.negList against a list comprehension on 128 random floats.
arr = [random.random() for i in range(128)]
test_x = basis.negList(arr)
test_y = [-e for e in arr]
# zip() stops at the shorter sequence, so a truncated result would otherwise
# pass unnoticed; check the length first.
if len(test_x) != len(test_y):
    print(f"\033[1;31mError: {basis.negList.__name__} returned {len(test_x)} elements, expects {len(test_y)}\033[0m")
    raise SystemExit(1)
for i, (x, y) in enumerate(zip(test_x, test_y)):
    if not is_close(x, y):
        print(f"\033[1;31mError: {basis.negList.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
        # Non-zero status: the failure used to be reported as exit code 0.
        raise SystemExit(1)
print(f"\033[1;34mPassed: {basis.negList.__name__} passed all tests\033[0m")
print("\033[1;32m[PASSED] Task 3 finished!\033[0m")
| @@ -0,0 +1,21 @@ | |||
| from uctc.framework import basis | |||
| import numpy as np | |||
| import math | |||
| import random | |||
def is_close(x, y):
    """Return True when x and y differ by strictly less than 1e-5."""
    return abs(x - y) < 1e-5


# Compare basis.addLists against an element-wise sum of two lists of 128
# random floats.
arr_a = [random.random() for i in range(128)]
arr_b = [random.random() for i in range(128)]
test_x = basis.addLists(arr_a, arr_b)
test_y = [e1 + e2 for e1, e2 in zip(arr_a, arr_b)]
# zip() stops at the shorter sequence, so a truncated result would otherwise
# pass unnoticed; check the length first.
if len(test_x) != len(test_y):
    print(f"\033[1;31mError: {basis.addLists.__name__} returned {len(test_x)} elements, expects {len(test_y)}\033[0m")
    raise SystemExit(1)
for i, (x, y) in enumerate(zip(test_x, test_y)):
    if not is_close(x, y):
        print(f"\033[1;31mError: {basis.addLists.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
        # Non-zero status: the failure used to be reported as exit code 0.
        raise SystemExit(1)
print(f"\033[1;34mPassed: {basis.addLists.__name__} passed all tests\033[0m")
print("\033[1;32m[PASSED] Task 4 finished!\033[0m")
| @@ -0,0 +1,30 @@ | |||
| from uctc.framework import basis | |||
| import numpy as np | |||
| from functools import reduce | |||
| import random | |||
def is_close(x, y):
    """Return True when x and y differ by strictly less than 1e-3."""
    return abs(x - y) < 1e-3


# Compare basis.sumList / basis.prodList against functools.reduce on 128
# random floats drawn from [0, 1).
arr = [random.random() for i in range(128)]
test_x1 = basis.sumList(arr)
test_x2 = basis.prodList(arr)
test_y1 = reduce(lambda x, y: x + y, arr, 0.0)
test_y2 = reduce(lambda x, y: x * y, arr, 1.0)
if not is_close(test_x1, test_y1):
    print(f"\033[1;31mError: {basis.sumList.__name__} failed test... expects {test_y1} but gets {test_x1}\033[0m")
    # Non-zero status: the failure used to be reported as exit code 0.
    raise SystemExit(1)
print(f"\033[1;34mPassed: {basis.sumList.__name__} passed all tests\033[0m")
# NOTE(review): a product of 128 values below 1 is tiny, so with an absolute
# tolerance of 1e-3 this check also accepts results such as 0.0; consider a
# relative comparison.
if not is_close(test_x2, test_y2):
    print(f"\033[1;31mError: {basis.prodList.__name__} failed test... expects {test_y2} but gets {test_x2}\033[0m")
    raise SystemExit(1)
print(f"\033[1;34mPassed: {basis.prodList.__name__} passed all tests\033[0m")
# NOTE(review): the banner says "Task 3" although this script checks sumList
# and prodList; confirm the intended task number.
print("\033[1;32m[PASSED] Task 3 finished!\033[0m")
| @@ -0,0 +1,41 @@ | |||
| import numpy as np | |||
| import uctc.nn as nn | |||
# Build a random 42x48 matrix and its uctc tensor counterpart.
tensor1 = np.random.rand(42, 48)
tensor2 = nn.pyarray_to_tensor(tensor1)

# Task 13: the flattened transposes must agree element by element.
t_tensor1 = tensor1.transpose()
t_tensor2 = tensor2.transpose()
t_2data = t_tensor2.data()
t_1data = t_tensor1.flatten().tolist()


def is_close(x, y):
    """Return True when x and y differ by strictly less than 1e-5."""
    return abs(x - y) < 1e-5


for i in range(len(t_1data)):
    if not is_close(t_1data[i], t_2data[i]):
        print(f"\033[1;31mTask 13 Error: t1 data[{i}] != t2 data[{i}]\033[0m")
        # Non-zero status: failures used to be reported as exit code 0.
        raise SystemExit(1)

# Task 14: argmax along axis 0.
at2 = nn.argmax(tensor2, 0).data()
at1 = np.argmax(tensor1, 0).flatten().tolist()
for i in range(len(at1)):
    if not is_close(at1[i], at2[i]):
        print(f"\033[1;31mTask 14 Error: at1 data[{i}] != at2 data[{i}]\033[0m")
        raise SystemExit(1)

# Task 14: argmax along axis 1. This loop previously re-checked at1/at2, so
# the axis-1 result was never verified.
at4 = nn.argmax(tensor2, 1).data()
at3 = np.argmax(tensor1, 1).flatten().tolist()
for i in range(len(at3)):
    if not is_close(at3[i], at4[i]):
        print(f"\033[1;31mTask 14 Error: at3 data[{i}] != at4 data[{i}]\033[0m")
        raise SystemExit(1)
print("\033[1;32m[PASSED] Task 13-14 finished!\033[0m")
| @@ -0,0 +1,579 @@ | |||
| # A custom autograder for this project | |||
| ################################################################################ | |||
| # A mini-framework for autograding | |||
| ################################################################################ | |||
| import optparse | |||
| import pickle | |||
| import random | |||
| import sys | |||
| import traceback | |||
class WritableNull:
    """File-like sink whose write() and flush() accept calls and do nothing."""

    def write(self, string):
        """Discard ``string``."""
        return None

    def flush(self):
        """Nothing is buffered, so there is nothing to flush."""
        return None
class Tracker(object):
    """Bookkeeping for one autograder run.

    Holds the points earned per question, the question and test currently
    executing, and optionally silences stdout while a test runs.
    """

    def __init__(self, questions, maxes, prereqs, mute_output):
        """
        questions: names of the questions to grade.
        maxes: mapping from question name to its maximum points.
        prereqs: mapping from question name to the names it depends on.
        mute_output: when true, stdout is silenced between begin_test and
            end_test.
        """
        self.questions = questions
        self.maxes = maxes
        self.prereqs = prereqs
        self.points = dict.fromkeys(self.questions, 0)
        self.current_question = None
        self.current_test = None
        self.points_at_test_start = None
        self.possible_points_remaining = None
        self.mute_output = mute_output
        self.original_stdout = None
        self.muted = False

    def mute(self):
        """Redirect sys.stdout to a null sink; a no-op when already muted."""
        if not self.muted:
            self.muted = True
            self.original_stdout = sys.stdout
            sys.stdout = WritableNull()

    def unmute(self):
        """Restore the stdout saved by mute(); a no-op when not muted."""
        if self.muted:
            self.muted = False
            sys.stdout = self.original_stdout

    def begin_q(self, q):
        """Print the question header and start grading question ``q``.

        Returns False, without starting the question, when a prerequisite has
        fewer points than its maximum; otherwise returns True.
        """
        assert q in self.questions
        header = 'Question {}'.format(q)
        print('\n' + header)
        print('=' * len(header))
        for prereq in sorted(self.prereqs[q]):
            if self.points[prereq] < self.maxes[prereq]:
                print("""*** NOTE: Make sure to complete Question {} before working on Question {},
*** because Question {} builds upon your answer for Question {}.
""".format(prereq, q, q, prereq))
                return False
        self.current_question = q
        self.possible_points_remaining = self.maxes[q]
        return True

    def begin_test(self, test_name):
        """Record the test name and current score, then mute if configured."""
        self.current_test = test_name
        self.points_at_test_start = self.points[self.current_question]
        print("*** {}) {}".format(self.current_question, self.current_test))
        if self.mute_output:
            self.mute()

    def end_test(self, pts):
        """Finish the current test, which was worth ``pts`` points.

        Prints PASS when exactly ``pts`` points were added since begin_test,
        FAIL when none were, and nothing for any other amount.
        """
        if self.mute_output:
            self.unmute()
        self.possible_points_remaining -= pts
        score_now = self.points[self.current_question]
        score_before = self.points_at_test_start
        if score_now == score_before + pts:
            print("*** PASS: {}".format(self.current_test))
        elif score_now == score_before:
            print("*** FAIL")
        self.current_test = None
        self.points_at_test_start = None

    def end_q(self):
        """Print the score for the current question and clear it.

        Asserts that the tests run since begin_q account for all of the
        question's points.
        """
        assert self.current_question is not None
        assert self.possible_points_remaining == 0
        finished = self.current_question
        print('\n### Question {}: {}/{} ###'.format(
            finished, self.points[finished], self.maxes[finished]))
        self.current_question = None
        self.possible_points_remaining = None

    def finalize(self):
        """Print the finish time and the per-question and total scores."""
        import time
        print('\nFinished at %d:%02d:%02d' % time.localtime()[3:6])
        print("\nProvisional grades\n==================")
        for q in self.questions:
            print('Question %s: %d/%d' % (q, self.points[q], self.maxes[q]))
        print('------------------')
        earned_total = sum(self.points.values())
        possible_total = sum([self.maxes[q] for q in self.questions])
        print('Total: %d/%d' % (earned_total, possible_total))
        print("""
Your grades are NOT yet registered. To register your grades, make sure
to follow your instructor's guidelines to receive credit on your project.
""")

    def add_points(self, pts):
        """Add ``pts`` to the score of the current question."""
        self.points[self.current_question] += pts
# Registry of (question, points, function) tuples, appended to by the @test
# decorator.
| TESTS = [] | |||
# Maps a question name to the set of question names it depends on; filled in
# by add_prereq().
| PREREQS = {} | |||
def add_prereq(q, pre):
    """Record that question ``q`` depends on ``pre``.

    ``pre`` is either a single question name (a str) or an iterable of names;
    the names are merged into the set stored under ``q`` in PREREQS.
    """
    required = [pre] if isinstance(pre, str) else pre
    PREREQS.setdefault(q, set()).update(required)
| def test(q, points): | |||
| def deco(fn): | |||
| TESTS.append((q, points, fn)) | |||
| return fn | |||
| return deco | |||
def parse_options(argv):
    """Parse the autograder's command-line flags.

    argv: full argument list, program name included (e.g. ``sys.argv``).
    Returns the optparse options object; positional arguments are ignored.
    """
    parser = optparse.OptionParser(description='Run public tests on student code')
    parser.set_defaults(
        edx_output=False,
        gs_output=False,
        no_graphics=False,
        mute_output=False,
        check_dependencies=False,
    )
    compatibility_help = 'Ignored, present for compatibility only'
    parser.add_option('--edx-output', dest='edx_output', action='store_true',
                      help=compatibility_help)
    parser.add_option('--gradescope-output', dest='gs_output', action='store_true',
                      help=compatibility_help)
    parser.add_option('--question', '-q', dest='grade_question', default=None,
                      help='Grade only one question (e.g. `-q q1`)')
    parser.add_option('--no-graphics', dest='no_graphics', action='store_true',
                      help='Do not display graphics (visualizing your implementation is highly recommended for debugging).')
    parser.add_option('--mute', dest='mute_output', action='store_true',
                      help='Mute output from executing tests')
    parser.add_option('--check-dependencies', dest='check_dependencies', action='store_true',
                      help='check that numpy and matplotlib are installed')
    options, _positional = parser.parse_args(argv)
    return options
def main():
    """Entry point: parse options, run the registered tests, print the grades.

    With --check-dependencies only the dependency check runs. Otherwise every
    selected question is graded in sorted order; a question whose
    prerequisites are unmet is skipped.
    """
    options = parse_options(sys.argv)
    if options.check_dependencies:
        check_dependencies()
        return
    if options.no_graphics:
        disable_graphics()

    # Collect the question names and the maximum score of each one.
    maxes = {}
    question_names = set()
    for q, points, fn in TESTS:
        question_names.add(q)
        maxes[q] = maxes.get(q, 0) + points
        if q not in PREREQS:
            PREREQS[q] = set()
    questions = list(sorted(question_names))

    selected = options.grade_question
    if selected:
        if selected not in questions:
            print("ERROR: question {} does not exist".format(selected))
            sys.exit(1)
        else:
            # Grading a single question ignores its prerequisites.
            questions = [selected]
            PREREQS[selected] = set()

    tracker = Tracker(questions, maxes, PREREQS, options.mute_output)
    for q in questions:
        if not tracker.begin_q(q):
            continue
        for testq, points, fn in TESTS:
            if testq != q:
                continue
            tracker.begin_test(fn.__name__)
            try:
                fn(tracker)
            except KeyboardInterrupt:
                tracker.unmute()
                print("\n\nCaught KeyboardInterrupt: aborting autograder")
                tracker.finalize()
                print("\n[autograder was interrupted before finishing]")
                sys.exit(1)
            except:
                # Any other failure inside a test is reported and grading
                # moves on to the next test.
                tracker.unmute()
                print(traceback.format_exc())
            tracker.end_test(points)
        tracker.end_q()
    tracker.finalize()
| ################################################################################ | |||
| # Tests begin here | |||
| ################################################################################ | |||
| import numpy as np | |||
| import matplotlib | |||
| import contextlib | |||
| import nn | |||
| import backend | |||
def check_dependencies():
    """Draw a rotating line segment with matplotlib for 400 frames.

    Opens a non-blocking figure with both axes limited to [-1, 1] and redraws
    the segment each frame.
    """
    import matplotlib.pyplot as plt
    import time
    figure, axes = plt.subplots(1, 1)
    axes.set_xlim([-1, 1])
    axes.set_ylim([-1, 1])
    segment, = axes.plot([], [], color="black")
    plt.show(block=False)
    for frame in range(400):
        angle = frame * 0.05
        x, y = np.sin(angle), np.cos(angle)
        # The segment runs through the origin between (x, y) and (-x, -y).
        segment.set_data([x, -x], [y, -y])
        figure.canvas.draw_idle()
        figure.canvas.start_event_loop(1e-3)
def disable_graphics():
    """Switch graphics off by setting ``backend.use_graphics`` to False."""
    backend.use_graphics = False
@contextlib.contextmanager
def no_graphics():
    """Context manager that sets ``backend.use_graphics`` to False for the
    duration of the ``with`` block and then restores the previous value.

    The restore runs in a ``finally`` clause, so the previous value is put
    back even when the body raises.
    """
    old_use_graphics = backend.use_graphics
    backend.use_graphics = False
    try:
        yield
    finally:
        backend.use_graphics = old_use_graphics
def verify_node(node, expected_type, expected_shape, method_name):
    """Assert that ``node`` has the expected kind and, for non-loss nodes, shape.

    node: object returned by the student's method.
    expected_type: 'parameter', 'loss' or 'node'.
    expected_shape: sequence compared pairwise with ``node.data.shape``; an
        entry equal to '?' matches any size. Not used for 'loss'.
    method_name: name of the method, used in the assertion messages.

    Raises AssertionError when a check fails or ``expected_type`` is unknown.
    """
    if expected_type == 'parameter':
        assert node is not None, (
            "{} should return an instance of nn.Parameter, not None".format(method_name))
        assert isinstance(node, nn.Parameter), (
            "{} should return an instance of nn.Parameter, instead got type {!r}".format(
                method_name, type(node).__name__))
    elif expected_type == 'loss':
        assert node is not None, (
            "{} should return an instance a loss node, not None".format(method_name))
        assert isinstance(node, (nn.SquareLoss, nn.SoftmaxLoss)), (
            "{} should return a loss node, instead got type {!r}".format(
                method_name, type(node).__name__))
    elif expected_type == 'node':
        assert node is not None, (
            "{} should return a node object, not None".format(method_name))
        assert isinstance(node, nn.Node), (
            "{} should return a node object, instead got type {!r}".format(
                method_name, type(node).__name__))
    else:
        assert False, "If you see this message, please report a bug in the autograder"
    if expected_type != 'loss':
        # Compare the wildcard by value: `is '?'` depended on string interning
        # and triggers a SyntaxWarning on current Python versions.
        assert all([(expected == '?' or actual == expected) for (actual, expected) in zip(node.data.shape, expected_shape)]), (
            "{} should return an object with shape {}, got {}".format(
                method_name, nn.format_shape(expected_shape), nn.format_shape(node.data.shape)))
def trace_node(node_to_trace):
    """
    Returns a set containing the node and all ancestors in the computation graph

    Ancestors are found by following each node's ``parents`` attribute. The
    walk uses an explicit stack, so deep graphs cannot hit Python's recursion
    limit, and each node is expanded only once.
    """
    nodes = set()
    pending = [node_to_trace]
    while pending:
        node = pending.pop()
        if node in nodes:
            continue
        nodes.add(node)
        pending.extend(node.parents)
    return nodes
@test('q1', points=6)
def check_perceptron(tracker):
    """Grade the perceptron question (6 points).

    2 points are awarded once the sanity checks on weights, scores and
    predictions pass; 4 more once the trained perceptron classifies every
    training example correctly. Failed checks raise AssertionError.
    """
    import models

    print("Sanity checking perceptron...")
    np_random = np.random.RandomState(0)
    # Check that the perceptron weights are initialized to a vector with `dimensions` entries.
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        p_weights = p.get_weights()
        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")

    # Check that run returns a node, and that the score in the node is correct
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        p_weights = p.get_weights()
        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")
        point = np_random.uniform(-10, 10, (1, dimensions))
        score = p.run(nn.Constant(point))
        verify_node(score, 'node', (1, 1), "PerceptronModel.run()")
        calculated_score = nn.as_scalar(score)
        expected_score = float(np.dot(point.flatten(), p_weights.data.flatten()))
        assert np.isclose(calculated_score, expected_score), (
            "The score computed by PerceptronModel.run() ({:.4f}) does not match the expected score ({:.4f})".format(
                calculated_score, expected_score))

    # Check that get_prediction returns the correct values, including the
    # case when a point lies exactly on the decision boundary
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        random_point = np_random.uniform(-10, 10, (1, dimensions))
        for point in (random_point, np.zeros_like(random_point)):
            prediction = p.get_prediction(nn.Constant(point))
            assert prediction == 1 or prediction == -1, (
                "PerceptronModel.get_prediction() should return 1 or -1, not {}".format(
                    prediction))
            # ndarray.item() replaces np.asscalar, which was removed from NumPy.
            expected_prediction = np.where(np.dot(point, p.get_weights().data.T) >= 0, 1, -1).item()
            assert prediction == expected_prediction, (
                "PerceptronModel.get_prediction() returned {}; expected {}".format(
                    prediction, expected_prediction))

    tracker.add_points(2)  # Partial credit for passing sanity checks

    print("Sanity checking perceptron weight updates...")
    # Test weight updates. This involves constructing a dataset that
    # requires 0 or 1 updates before convergence, and testing that weight
    # values change as expected. Note that (multiplier < -1 or multiplier > 1)
    # must be true for the testing code to be correct.
    dimensions = 2
    for multiplier in (-5, -2, 2, 5):
        p = models.PerceptronModel(dimensions)
        orig_weights = p.get_weights().data.reshape((1, dimensions)).copy()
        if np.abs(orig_weights).sum() == 0.0:
            # This autograder test doesn't work when weights are exactly zero
            continue
        point = multiplier * orig_weights
        sanity_dataset = backend.Dataset(
            x=np.tile(point, (500, 1)),
            y=np.ones((500, 1)) * -1.0
        )
        p.train(sanity_dataset)
        new_weights = p.get_weights().data.reshape((1, dimensions))
        if multiplier < 0:
            expected_weights = orig_weights
        else:
            expected_weights = orig_weights - point
        if not np.all(new_weights == expected_weights):
            print()
            print("Initial perceptron weights were: [{:.4f}, {:.4f}]".format(
                orig_weights[0,0], orig_weights[0,1]))
            print("All data points in the dataset were identical and had:")
            print("    x = [{:.4f}, {:.4f}]".format(
                point[0,0], point[0,1]))
            print("    y = -1")
            print("Your trained weights were: [{:.4f}, {:.4f}]".format(
                new_weights[0,0], new_weights[0,1]))
            print("Expected weights after training: [{:.4f}, {:.4f}]".format(
                expected_weights[0,0], expected_weights[0,1]))
            print()
            assert False, "Weight update sanity check failed"

    print("Sanity checking complete. Now training perceptron")
    model = models.PerceptronModel(3)
    dataset = backend.PerceptronDataset(model)
    model.train(dataset)
    backend.maybe_sleep_and_close(1)
    assert dataset.epoch != 0, "Perceptron code never iterated over the training data"
    accuracy = np.mean(np.where(np.dot(dataset.x, model.get_weights().data.T) >= 0.0, 1.0, -1.0) == dataset.y)
    if accuracy < 1.0:
        print("The weights learned by your perceptron correctly classified {:.2%} of training examples".format(accuracy))
        print("To receive full points for this question, your perceptron must converge to 100% accuracy")
        return
    tracker.add_points(4)
@test('q2', points=6)
def check_regression(tracker):
    """Grade the regression question (6 points).

    2 points are awarded once run() and get_loss() pass the structural checks
    for batch sizes 1, 2 and 4; 4 more when the loss on the full dataset after
    training is at most 0.02. Failed checks raise AssertionError.
    """
    import models
    model = models.RegressionModel()
    dataset = backend.RegressionDataset(model)
    batch_sizes = (1, 2, 4)

    # run(): output shape, dependence on the input, and parameter re-use.
    detected_parameters = None
    for batch_size in batch_sizes:
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        output_node = model.run(inp_x)
        verify_node(output_node, 'node', (batch_size, 1), "RegressionModel.run()")
        trace = trace_node(output_node)
        assert inp_x in trace, "Node returned from RegressionModel.run() does not depend on the provided input (x)"
        if detected_parameters is None:
            # The parameters seen on the first call are the reference set.
            detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]
        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "Calling RegressionModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    # get_loss(): node type, dependence on x and y, and no extra parameters.
    for batch_size in batch_sizes:
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        loss_node = model.get_loss(inp_x, inp_y)
        verify_node(loss_node, 'loss', None, "RegressionModel.get_loss()")
        trace = trace_node(loss_node)
        assert inp_x in trace, "Node returned from RegressionModel.get_loss() does not depend on the provided input (x)"
        assert inp_y in trace, "Node returned from RegressionModel.get_loss() does not depend on the provided labels (y)"
        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "RegressionModel.get_loss() should not use additional parameters not used by RegressionModel.run()")

    tracker.add_points(2)  # Partial credit for passing sanity checks

    model.train(dataset)
    backend.maybe_sleep_and_close(1)

    full_x = dataset.x
    full_y = dataset.y
    train_loss = model.get_loss(nn.Constant(full_x), nn.Constant(full_y))
    verify_node(train_loss, 'loss', None, "RegressionModel.get_loss()")
    train_loss = nn.as_scalar(train_loss)

    # Re-compute the loss ourselves: otherwise get_loss() could be hard-coded
    # to always return zero
    train_predicted = model.run(nn.Constant(full_x))
    verify_node(train_predicted, 'node', (full_x.shape[0], 1), "RegressionModel.run()")
    sanity_loss = 0.5 * np.mean((train_predicted.data - full_y)**2)
    assert np.isclose(train_loss, sanity_loss), (
        "RegressionModel.get_loss() returned a loss of {:.4f}, "
        "but the autograder computed a loss of {:.4f} "
        "based on the output of RegressionModel.run()".format(
            train_loss, sanity_loss))

    loss_threshold = 0.02
    if not train_loss <= loss_threshold:
        print("Your final loss ({:f}) must be no more than {:.4f} to receive full points for this question".format(train_loss, loss_threshold))
        return
    print("Your final loss is: {:f}".format(train_loss))
    tracker.add_points(4)
@test('q3', points=6)
def check_digit_classification(tracker):
    """Autograder check for question 3 (digit classification).

    Sanity-checks the graphs built by ``DigitClassificationModel.run()`` and
    ``get_loss()`` on a few small batches (2 points), then trains the model and
    awards 4 more points when test-set accuracy reaches 97%.
    """
    import models

    model = models.DigitClassificationModel()
    dataset = backend.DigitClassificationDataset(model)
    sanity_batch_sizes = (1, 2, 4)

    # Parameters seen in the first run() graph; every later graph must reuse them.
    known_parameters = None
    for size in sanity_batch_sizes:
        batch_x = nn.Constant(dataset.x[:size])
        batch_y = nn.Constant(dataset.y[:size])  # constructed but not used in this loop
        scores = model.run(batch_x)
        verify_node(scores, 'node', (size, 10), "DigitClassificationModel.run()")
        graph = trace_node(scores)
        assert batch_x in graph, "Node returned from DigitClassificationModel.run() does not depend on the provided input (x)"

        if known_parameters is None:
            known_parameters = [member for member in graph if isinstance(member, nn.Parameter)]

        for member in graph:
            if isinstance(member, nn.Parameter):
                assert member in known_parameters, (
                    "Calling DigitClassificationModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    for size in sanity_batch_sizes:
        batch_x = nn.Constant(dataset.x[:size])
        batch_y = nn.Constant(dataset.y[:size])
        loss = model.get_loss(batch_x, batch_y)
        verify_node(loss, 'loss', None, "DigitClassificationModel.get_loss()")
        graph = trace_node(loss)
        assert batch_x in graph, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided input (x)"
        assert batch_y in graph, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided labels (y)"

        for member in graph:
            if isinstance(member, nn.Parameter):
                assert member in known_parameters, (
                    "DigitClassificationModel.get_loss() should not use additional parameters not used by DigitClassificationModel.run()")

    tracker.add_points(2)  # Partial credit for passing sanity checks

    model.train(dataset)

    # Score the trained model on the held-out test split.
    logits = model.run(nn.Constant(dataset.test_images)).data
    predictions = np.argmax(logits, axis=1)
    test_accuracy = np.mean(predictions == dataset.test_labels)

    accuracy_threshold = 0.97
    if test_accuracy >= accuracy_threshold:
        print("Your final test set accuracy is: {:%}".format(test_accuracy))
        tracker.add_points(4)
    else:
        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(test_accuracy, accuracy_threshold))
@test('q4', points=7)
def check_lang_id(tracker):
    """Autograder check for question 4 (language identification).

    Sanity-checks the graphs built by ``LanguageIDModel.run()`` and
    ``get_loss()`` for several (batch size, word length) combinations
    (2 points), then trains the model and awards 5 more points when test-set
    accuracy reaches 81%.
    """
    import models
    model = models.LanguageIDModel()
    dataset = backend.LanguageIDDataset(model)

    sanity_cases = ((1, 1), (2, 1), (2, 6), (4, 8))

    detected_parameters = None
    for batch_size, word_length in sanity_cases:
        # Take examples from the start of the last dev bucket, then truncate
        # the encoded word to the requested length.
        start = dataset.dev_buckets[-1, 0]
        end = start + batch_size
        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
        inp_xs = inp_xs[:word_length]

        output_node = model.run(inp_xs)
        verify_node(output_node, 'node', (batch_size, len(dataset.language_names)), "LanguageIDModel.run()")
        trace = trace_node(output_node)
        for inp_x in inp_xs:
            assert inp_x in trace, "Node returned from LanguageIDModel.run() does not depend on all of the provided inputs (xs)"

        # Word length 1 does not use parameters related to transferring the
        # hidden state across timesteps, so initial parameter detection is only
        # run for longer words
        if word_length > 1:
            if detected_parameters is None:
                detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]

            for node in trace:
                assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                    "Calling LanguageIDModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    for batch_size, word_length in sanity_cases:
        start = dataset.dev_buckets[-1, 0]
        end = start + batch_size
        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
        inp_xs = inp_xs[:word_length]

        loss_node = model.get_loss(inp_xs, inp_y)
        # Validate the loss node the same way the other question checks do,
        # so a missing or wrong return value is reported clearly.
        verify_node(loss_node, 'loss', None, "LanguageIDModel.get_loss()")
        trace = trace_node(loss_node)
        for inp_x in inp_xs:
            # This loop inspects the get_loss() graph, so the message names get_loss().
            assert inp_x in trace, "Node returned from LanguageIDModel.get_loss() does not depend on all of the provided inputs (xs)"
        assert inp_y in trace, "Node returned from LanguageIDModel.get_loss() does not depend on the provided labels (y)"

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "LanguageIDModel.get_loss() should not use additional parameters not used by LanguageIDModel.run()")

    tracker.add_points(2)  # Partial credit for passing sanity checks

    model.train(dataset)

    test_predicted_probs, test_predicted, test_correct = dataset._predict('test')
    test_accuracy = np.mean(test_predicted == test_correct)
    accuracy_threshold = 0.81
    if test_accuracy >= accuracy_threshold:
        print("Your final test set accuracy is: {:%}".format(test_accuracy))
        tracker.add_points(5)
    else:
        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(test_accuracy, accuracy_threshold))
| # Call main() only when this file is executed as a script, not when it is imported. | |||
| if __name__ == '__main__': | |||
| main() | |||
| @@ -0,0 +1,449 @@ | |||
| import collections | |||
| import os | |||
| import time | |||
| import os | |||
| import matplotlib.pyplot as plt | |||
| import numpy as np | |||
| import nn | |||
| use_graphics = True | |||
def maybe_sleep_and_close(seconds):
    """Pause, then close every open matplotlib figure.

    Does nothing when graphics are disabled or no figure is open.

    Args:
        seconds: how long to sleep before the figures are closed.
    """
    if not (use_graphics and plt.get_fignums()):
        return

    time.sleep(seconds)
    for fignum in plt.get_fignums():
        fig = plt.figure(fignum)
        plt.close(fig)
        try:
            # This raises a TclError on some Windows machines
            fig.canvas.start_event_loop(1e-3)
        except Exception:
            # Best effort only. Catching Exception rather than using a bare
            # except lets KeyboardInterrupt and SystemExit propagate.
            pass
def get_data_path(filename):
    """Locate a data file relative to this module.

    Tries, in order, ``<module dir>/../data/<filename>``,
    ``<module dir>/data/<filename>`` and ``<module dir>/<filename>``, and
    returns the first path that exists.

    Raises:
        Exception: if none of the candidate paths exists.
    """
    module_dir = os.path.dirname(__file__)
    candidates = (
        os.path.join(module_dir, os.pardir, "data", filename),
        os.path.join(module_dir, "data", filename),
        os.path.join(module_dir, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Could not find data file: {}".format(filename))
class Dataset(object):
    """Base dataset holding a 2-D float input array and a 2-D float label array."""

    def __init__(self, x, y):
        """Store ``x`` and ``y`` after validating type, dtype, rank and row count."""
        assert isinstance(x, np.ndarray)
        assert isinstance(y, np.ndarray)
        assert np.issubdtype(x.dtype, np.floating)
        assert np.issubdtype(y.dtype, np.floating)
        assert x.ndim == 2
        assert y.ndim == 2
        assert x.shape[0] == y.shape[0]
        self.x, self.y = x, y

    def iterate_once(self, batch_size):
        """Yield one pass over the data as ``(nn.Constant, nn.Constant)`` batches.

        The number of rows must be an exact multiple of ``batch_size``.
        """
        assert isinstance(batch_size, int) and batch_size > 0, (
            "Batch size should be a positive integer, got {!r}".format(
                batch_size))
        total_rows = self.x.shape[0]
        assert total_rows % batch_size == 0, (
            "Dataset size {:d} is not divisible by batch size {:d}".format(
                total_rows, batch_size))
        for start in range(0, total_rows, batch_size):
            stop = start + batch_size
            yield nn.Constant(self.x[start:stop]), nn.Constant(self.y[start:stop])

    def iterate_forever(self, batch_size):
        """Yield batches indefinitely by repeating ``iterate_once``."""
        while True:
            for batch in self.iterate_once(batch_size):
                yield batch

    def get_validation_accuracy(self):
        """Not available on the base class; always raises NotImplementedError."""
        raise NotImplementedError(
            "No validation data is available for this dataset. "
            "In this assignment, only the Digit Classification and Language "
            "Identification datasets have validation data.")
class PerceptronDataset(Dataset):
    """Random 2-D points plus a constant bias feature, labelled +1/-1.

    When graphics are enabled, plots the points and redraws the model's
    decision line while batches are consumed.
    """

    def __init__(self, model):
        """Sample 500 points, label them, and optionally set up the plot."""
        num_points = 500
        features = np.hstack([np.random.randn(num_points, 2), np.ones((num_points, 1))])
        # Label is +1 where x0 + 2*x1 - 1 >= 0, otherwise -1.
        labels = np.where(features[:, 0] + 2 * features[:, 1] - 1 >= 0, 1.0, -1.0)
        super().__init__(features, np.expand_dims(labels, axis=1))

        self.model = model
        self.epoch = 0

        if not use_graphics:
            return

        fig, ax = plt.subplots(1, 1)
        limits = np.array([-3.0, 3.0])
        ax.set_xlim(limits)
        ax.set_ylim(limits)
        # Drop the bias column when plotting.
        positive = ax.scatter(*features[labels == 1, :-1].T, color="red", marker="+")
        negative = ax.scatter(*features[labels == -1, :-1].T, color="blue", marker="_")
        line, = ax.plot([], [], color="black")
        text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
        ax.legend([positive, negative], [1, -1])
        plt.show(block=False)

        self.fig = fig
        self.limits = limits
        self.line = line
        self.text = text
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """Yield one epoch of batches, refreshing the plot at most every 0.01 s."""
        self.epoch += 1

        for batch_index, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            if not use_graphics or not (time.time() - self.last_update > 0.01):
                continue

            weights = self.model.get_weights().data.flatten()
            span = self.limits
            if weights[1] != 0:
                # Solve w0*x + w1*y + w2 = 0 for y across the x-limits.
                self.line.set_data(span, (-weights[0] * span - weights[2]) / weights[1])
            elif weights[0] != 0:
                # Vertical boundary at x = -w2 / w0.
                self.line.set_data(np.full(2, -weights[2] / weights[0]), span)
            else:
                self.line.set_data([], [])
            self.text.set_text(
                "epoch: {:,}\npoint: {:,}/{:,}\nweights: {}".format(
                    self.epoch, batch_index * batch_size + 1, len(self.x), weights))
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()
class RegressionDataset(Dataset):
    """200 shuffled samples of sin(x) on [-2*pi, 2*pi].

    When graphics are enabled, plots the true curve and periodically redraws
    the model's current prediction.
    """

    def __init__(self, model):
        """Build the inputs and targets and optionally set up the plot."""
        inputs = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        # Fixed-seed shuffle, so the example order is the same on every run.
        np.random.RandomState(0).shuffle(inputs)
        # Indices that put the shuffled inputs back in ascending order for plotting.
        self.argsort_x = np.argsort(inputs.flatten())
        targets = np.sin(inputs)
        super().__init__(inputs, targets)

        self.model = model
        self.processed = 0

        if not use_graphics:
            return

        fig, ax = plt.subplots(1, 1)
        ax.set_xlim(-2 * np.pi, 2 * np.pi)
        ax.set_ylim(-1.4, 1.4)
        real, = ax.plot(inputs[self.argsort_x], targets[self.argsort_x], color="blue")
        learned, = ax.plot([], [], color="red")
        text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
        ax.legend([real, learned], ["real", "learned"])
        plt.show(block=False)

        self.fig = fig
        self.learned = learned
        self.text = text
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """Yield one epoch of batches, refreshing the plot at most every 0.1 s."""
        for x, y in super().iterate_once(batch_size):
            yield x, y
            self.processed += batch_size

            if not use_graphics or not (time.time() - self.last_update > 0.1):
                continue

            order = self.argsort_x
            predicted = self.model.run(nn.Constant(self.x)).data
            loss = self.model.get_loss(
                nn.Constant(self.x), nn.Constant(self.y)).data
            self.learned.set_data(self.x[order], predicted[order])
            self.text.set_text("processed: {:,}\nloss: {:.6f}".format(
                self.processed, loss))
            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()
class DigitClassificationDataset(Dataset):
    """Digit dataset loaded from ``mnist.npz``.

    The training split becomes the dataset's ``x``/``y`` (labels one-hot
    encoded). The archive's test split is divided into even-indexed
    validation ("dev") examples and odd-indexed test examples.
    """

    def __init__(self, model):
        """Load the data, build one-hot labels, and optionally set up the plot."""
        mnist_path = get_data_path("mnist.npz")

        with np.load(mnist_path) as archive:
            train_images = archive["train_images"]
            train_labels = archive["train_labels"]
            test_images = archive["test_images"]
            test_labels = archive["test_labels"]
            assert len(train_images) == len(train_labels) == 60000
            assert len(test_images) == len(test_labels) == 10000
            self.dev_images = test_images[0::2]
            self.dev_labels = test_labels[0::2]
            self.test_images = test_images[1::2]
            self.test_labels = test_labels[1::2]

        num_train = len(train_images)
        one_hot = np.zeros((num_train, 10))
        one_hot[np.arange(num_train), train_labels] = 1

        super().__init__(train_images, one_hot)

        self.model = model
        self.epoch = 0

        if not use_graphics:
            return

        width = 20  # Width of each row expressed as a multiple of image width
        samples = 100  # Number of images to display per label
        fig = plt.figure()
        axes = {}
        images = collections.defaultdict(list)
        texts = collections.defaultdict(list)
        # Row 9 is created first so the other rows can share its x-axis.
        for digit in reversed(range(10)):
            row = plt.subplot2grid((30, 1), (3 * digit, 0), 2, 1,
                                   sharex=axes.get(9))
            axes[digit] = row
            plt.setp(row.get_xticklabels(), visible=digit == 9)
            row.set_yticks([])
            row.text(-0.03, 0.5, digit, transform=row.transAxes,
                     va="center")
            row.set_xlim(0, 28 * width)
            row.set_ylim(0, 28)
            for _ in range(samples):
                images[digit].append(row.imshow(
                    np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
                    alpha=0.3))
                texts[digit].append(row.text(
                    0, 0, "", ha="center", va="top", fontsize="smaller"))
        axes[9].set_xticks(np.linspace(0, 28 * width, 11))
        axes[9].set_xticklabels(
            ["{:.1f}".format(tick) for tick in np.linspace(0, 1, 11)])
        axes[9].tick_params(axis="x", pad=16)
        axes[9].set_xlabel("Probability of Correct Label")
        status = axes[0].text(
            0.5, 1.5, "", transform=axes[0].transAxes, ha="center",
            va="bottom")
        plt.show(block=False)

        self.width = width
        self.samples = samples
        self.fig = fig
        self.images = images
        self.texts = texts
        self.status = status
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """Yield one epoch of batches, refreshing the plot at most once per second."""
        self.epoch += 1

        for batch_index, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            if not use_graphics or not (time.time() - self.last_update > 1):
                continue

            dev_logits = self.model.run(nn.Constant(self.dev_images)).data
            dev_predicted = np.argmax(dev_logits, axis=1)
            dev_probs = np.exp(nn.SoftmaxLoss.log_softmax(dev_logits))
            dev_accuracy = np.mean(dev_predicted == self.dev_labels)
            self.status.set_text(
                "epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
                "{:.2%}".format(
                    self.epoch, batch_index, len(self.x) // batch_size, dev_accuracy))

            for digit in range(10):
                mask = self.dev_labels == digit
                predicted = dev_predicted[mask]
                probs = dev_probs[mask][:, digit]
                # Evenly spaced ranks through the examples sorted by the
                # probability assigned to the correct label.
                ranks = np.linspace(
                    0, len(probs) - 1, self.samples).astype(int)
                indices = probs.argsort()[ranks]
                chosen = zip(probs[indices], self.dev_images[mask][indices])
                for slot, (prob, image) in enumerate(chosen):
                    picture = self.images[digit][slot]
                    caption = self.texts[digit][slot]
                    guess = predicted[indices[slot]]

                    picture.set_data(image.reshape((28, 28)))
                    left = prob * (self.width - 1) * 28
                    if guess == digit:
                        picture.set_cmap("Greens")
                        caption.set_text("")
                    else:
                        picture.set_cmap("Reds")
                        caption.set_text(guess)
                    caption.set_x(left + 14)
                    picture.set_extent([left, left + 28, 0, 28])

            self.fig.canvas.draw_idle()
            self.fig.canvas.start_event_loop(1e-3)
            self.last_update = time.time()

    def get_validation_accuracy(self):
        """Return the fraction of validation images the model classifies correctly."""
        logits = self.model.run(nn.Constant(self.dev_images)).data
        guesses = np.argmax(logits, axis=1)
        return np.mean(guesses == self.dev_labels)
class LanguageIDDataset(Dataset):
    """Language-identification dataset loaded from ``lang_id.npz``.

    Words are stored as rows of character indices padded with -1, and are
    grouped into buckets given as (start, end) row ranges. The base class
    ``__init__`` is not called.
    """

    def __init__(self, model):
        """Load all splits, pick spotlight examples, and build print templates."""
        self.model = model

        data_path = get_data_path("lang_id.npz")

        with np.load(data_path) as archive:
            self.chars = archive['chars']
            self.language_codes = archive['language_codes']
            self.language_names = archive['language_names']
            self.train_x = archive['train_x']
            self.train_y = archive['train_y']
            self.train_buckets = archive['train_buckets']
            self.dev_x = archive['dev_x']
            self.dev_y = archive['dev_y']
            self.dev_buckets = archive['dev_buckets']
            self.test_x = archive['test_x']
            self.test_y = archive['test_y']
            self.test_buckets = archive['test_buckets']

        self.epoch = 0
        # Sampling probability of each training bucket, proportional to its size.
        bucket_sizes = self.train_buckets[:, 1] - self.train_buckets[:, 0]
        self.bucket_weights = bucket_sizes
        self.bucket_weights = bucket_sizes / float(bucket_sizes.sum())

        self.chars_print = self.chars
        try:
            print(u"Alphabet: {}".format(u"".join(self.chars)))
        except UnicodeEncodeError:
            # Fall back to an ASCII stand-in alphabet for printing.
            self.chars_print = "abcdefghijklmnopqrstuvwxyzaaeeeeiinoouuacelnszz"
            print("Alphabet: " + self.chars_print)
            self.chars_print = list(self.chars_print)
            print("""
NOTE: Your terminal does not appear to support printing Unicode characters.
For the purposes of printing to the terminal, some of the letters in the
alphabet above have been substituted with ASCII symbols.""".strip())
        print("")

        # Select some examples to spotlight in the monitoring phase (3 per language)
        chosen = []
        for lang_index in range(len(self.language_names)):
            lang_rows = np.nonzero(self.dev_y == lang_index)[0]
            lang_rows = np.random.choice(lang_rows, size=3, replace=False)
            chosen.extend(list(lang_rows))
        self.spotlight_idxs = np.array(chosen, dtype=int)

        # Templates for printing updates as training progresses
        max_word_len = self.dev_x.shape[1]
        max_lang_len = max(len(name) for name in self.language_names)

        self.predicted_template = u"Pred: {:<NUM}".replace('NUM',
            str(max_lang_len))

        pieces = [
            u" ",
            u"{:<NUM} ".replace('NUM', str(max_word_len)),
            u"{:<NUM} ({:6.1%})".replace('NUM', str(max_lang_len)),
            u" {:<NUM} ".replace('NUM', str(max_lang_len + len('Pred: '))),
        ]
        for lang_index in range(len(self.language_names)):
            pieces.append(u"|{}".format(self.language_codes[lang_index]))
            pieces.append("{probs[" + str(lang_index) + "]:4.0%}")
        self.word_template = u"".join(pieces)

        self.last_update = time.time()

    def _encode(self, inp_x, inp_y):
        """One-hot encode a batch of words and labels.

        Returns a list with one ``nn.Constant`` per character position, up to
        the first all-padding column, and an ``nn.Constant`` of one-hot labels.
        """
        num_chars = len(self.chars)
        xs = []
        for position in range(inp_x.shape[1]):
            column = inp_x[:, position]
            if np.all(column == -1):
                break
            assert not np.any(column == -1), (
                "Please report this error in the project: batching by length was done incorrectly in the provided code")
            xs.append(nn.Constant(np.eye(num_chars)[column]))
        y = nn.Constant(np.eye(len(self.language_names))[inp_y])
        return xs, y

    def _softmax(self, x):
        """Softmax along the last axis, shifted by the row maximum."""
        shifted = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return shifted / np.sum(shifted, axis=-1, keepdims=True)

    def _predict(self, split='dev'):
        """Run the model over every bucket of a split.

        ``split='dev'`` selects the validation data; any other value selects
        the test data. Returns ``(probabilities, predicted labels, true labels)``.
        """
        if split == 'dev':
            data_x, data_y, buckets = self.dev_x, self.dev_y, self.dev_buckets
        else:
            data_x, data_y, buckets = self.test_x, self.test_y, self.test_buckets

        scores = []
        truth = []
        for bucket_id in range(buckets.shape[0]):
            start, end = buckets[bucket_id]
            xs, y = self._encode(data_x[start:end], data_y[start:end])
            predicted = self.model.run(xs)

            scores.extend(list(predicted.data))
            truth.extend(list(data_y[start:end]))

        all_predicted_probs = self._softmax(np.asarray(scores))
        all_predicted = np.asarray(scores).argmax(axis=-1)
        all_correct = np.asarray(truth)

        return all_predicted_probs, all_predicted, all_correct

    def iterate_once(self, batch_size):
        """Yield one epoch of randomly sampled, same-bucket batches.

        Each batch picks a bucket according to ``bucket_weights`` and samples
        ``batch_size`` rows from it. At most every 0.5 s (graphics mode only)
        validation accuracy and the spotlight words are printed.
        """
        assert isinstance(batch_size, int) and batch_size > 0, (
            "Batch size should be a positive integer, got {!r}".format(
                batch_size))
        assert self.train_x.shape[0] >= batch_size, (
            "Dataset size {:d} is smaller than the batch size {:d}".format(
                self.train_x.shape[0], batch_size))

        self.epoch += 1

        for iteration in range(self.train_x.shape[0] // batch_size):
            bucket_id = np.random.choice(self.bucket_weights.shape[0], p=self.bucket_weights)
            bucket_start = self.train_buckets[bucket_id, 0]
            bucket_end = self.train_buckets[bucket_id, 1]
            example_ids = bucket_start + np.random.choice(
                bucket_end - bucket_start,
                size=batch_size)

            yield self._encode(self.train_x[example_ids], self.train_y[example_ids])

            if not use_graphics or not (time.time() - self.last_update > 0.5):
                continue

            dev_predicted_probs, dev_predicted, dev_correct = self._predict()
            dev_accuracy = np.mean(dev_predicted == dev_correct)
            print("epoch {:,} iteration {:,} validation-accuracy {:.1%}".format(
                self.epoch, iteration, dev_accuracy))

            for idx in self.spotlight_idxs:
                correct = (dev_predicted[idx] == dev_correct[idx])
                word = u"".join([self.chars_print[ch] for ch in self.dev_x[idx] if ch != -1])
                if correct:
                    wrong_guess = ""
                else:
                    wrong_guess = self.predicted_template.format(
                        self.language_names[dev_predicted[idx]])

                print(self.word_template.format(
                    word,
                    self.language_names[dev_correct[idx]],
                    dev_predicted_probs[idx, dev_correct[idx]],
                    wrong_guess,
                    probs=dev_predicted_probs[idx,:],
                ))
            self.last_update = time.time()

    def get_validation_accuracy(self):
        """Return the fraction of validation words the model classifies correctly."""
        _, guesses, truth = self._predict()
        return np.mean(guesses == truth)
def main():
    """Train the digit-classification model on its dataset.

    The other model/dataset pairs are kept below as commented-out
    alternatives.
    """
    import models

    # model = models.PerceptronModel(3)
    # dataset = PerceptronDataset(model)
    # model.train(dataset)

    # model = models.RegressionModel()
    # dataset = RegressionDataset(model)
    # model.train(dataset)

    classifier = models.DigitClassificationModel()
    digits = DigitClassificationDataset(classifier)
    classifier.train(digits)

    # model = models.LanguageIDModel()
    # dataset = LanguageIDDataset(model)
    # model.train(dataset)
| # Call main() only when this file is executed as a script, not when it is imported. | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -0,0 +1,292 @@ | |||
| import nn | |||
class PerceptronModel(object):
    """Binary perceptron classifier backed by a single ``nn.Parameter``."""

    def __init__(self, dimensions):
        """
        Initialize a new Perceptron instance.

        A perceptron classifies data points as either belonging to a particular
        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
        For example, dimensions=2 would mean that the perceptron must classify
        2D points.
        """
        self.w = nn.Parameter(1, dimensions)

    def get_weights(self):
        """
        Return a Parameter instance with the current weights of the perceptron.
        """
        return self.w

    def run(self, x):
        """
        Calculates the score assigned by the perceptron to a data point x.

        Inputs:
            x: a node with shape (1 x dimensions)
        Returns: a node containing a single number (the score)
        """
        return nn.DotProduct(x, self.get_weights())

    def get_prediction(self, x):
        """
        Calculates the predicted class for a single data point `x`.

        Returns: 1 or -1
        """
        return 1 if nn.as_scalar(self.run(x)) >= 0 else -1

    def train(self, dataset):
        """
        Train the perceptron until convergence.

        Repeats passes over the dataset one example at a time. Each
        misclassified example triggers ``self.w.update(x, label)``. Training
        stops after the first pass with no misclassification.
        """
        batch_size = 1
        while True:
            converged = True
            for x, y in dataset.iterate_once(batch_size):
                label = nn.as_scalar(y)
                if self.get_prediction(x) != label:
                    converged = False
                    self.w.update(x, label)
            if converged:
                break
class RegressionModel(object):
    """
    A neural network model for approximating a function that maps from real
    numbers to real numbers. The network should be sufficiently large to be able
    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.

    The network has one hidden layer of ``self.h`` ReLU units. It is trained
    with minibatches of ``self.b`` examples and step size ``self.learning_rate``.
    """

    def __init__(self):
        # Layer sizes: input width, output width, hidden width.
        self.i = 1
        self.o = 1
        self.h = 50
        # Minibatch size and gradient step size used by train().
        self.b = 10
        self.learning_rate = 0.01
        self.W1 = nn.Parameter(self.i, self.h)
        self.b1 = nn.Parameter(1, self.h)
        self.W2 = nn.Parameter(self.h, self.o)
        self.b2 = nn.Parameter(1, self.o)

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
        Returns:
            A node with shape (batch_size x 1) containing predicted y-values
        """
        hidden = nn.ReLU(nn.AddBias(nn.Linear(x, self.W1), self.b1))
        return nn.AddBias(nn.Linear(hidden, self.W2), self.b2)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
            y: a node with shape (batch_size x 1), containing the true y-values
                to be used for training
        Returns: a loss node
        """
        return nn.SquareLoss(self.run(x), y)

    def train(self, dataset):
        """
        Trains the model.

        Runs at most 20 passes over the dataset. After each pass the loss on
        the whole dataset is evaluated and training stops early once it drops
        below 0.01. The whole-dataset loss is used because a single
        minibatch's loss is a noisy stopping signal.
        """
        parameters = [self.W1, self.b1, self.W2, self.b2]
        for _ in range(20):
            for x, y in dataset.iterate_once(self.b):
                loss = self.get_loss(x, y)
                # All gradients are computed before any parameter is changed.
                gradients = nn.gradients(loss, parameters)
                for parameter, gradient in zip(parameters, gradients):
                    parameter.update(gradient, -self.learning_rate)

            epoch_loss = self.get_loss(nn.Constant(dataset.x), nn.Constant(dataset.y))
            if nn.as_scalar(epoch_loss) < 0.01:
                break
class DigitClassificationModel(object):
    """
    A model for handwritten digit classification using the MNIST dataset.

    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
    into a 784-dimensional vector for the purposes of this model. Each entry in
    the vector is a floating point number between 0 and 1.

    The goal is to sort each digit into one of 10 classes (number 0 through 9).

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)
    """

    def __init__(self):
        # Layer widths: 784 inputs -> 200 -> 100 -> 10 class scores.
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10
        # Gradient step size and minibatch size used by train().
        self.lr = 0.01
        self.batch_size = 100
        # Validation accuracy at which train() stops. It sits slightly above
        # the 97% test accuracy the autograder requires, to leave a margin.
        self.target_accuracy = 0.975
        self.w1 = nn.Parameter(self.input_features, self.h1)
        self.b1 = nn.Parameter(1, self.h1)
        self.w2 = nn.Parameter(self.h1, self.h2)
        self.b2 = nn.Parameter(1, self.h2)
        self.w3 = nn.Parameter(self.h2, self.output_features)
        self.b3 = nn.Parameter(1, self.output_features)

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Your model should predict a node with shape (batch_size x 10),
        containing scores. Higher scores correspond to greater probability of
        the image belonging to a particular class.

        Inputs:
            x: a node with shape (batch_size x 784)
        Output:
            A node with shape (batch_size x 10) containing predicted scores
                (also called logits)
        """
        hidden1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        hidden2 = nn.ReLU(nn.AddBias(nn.Linear(hidden1, self.w2), self.b2))
        return nn.AddBias(nn.Linear(hidden2, self.w3), self.b3)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a node with shape
        (batch_size x 10). Each row is a one-hot vector encoding the correct
        digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss node
        """
        return nn.SoftmaxLoss(self.run(x), y)

    def train(self, dataset):
        """
        Trains the model.

        Repeats full passes over the dataset. After each pass the validation
        accuracy is printed, and training stops once it exceeds
        ``self.target_accuracy``.
        """
        parameters = [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3]
        while True:
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                # All gradients are computed before any parameter is changed.
                gradients = nn.gradients(loss, parameters)
                for parameter, gradient in zip(parameters, gradients):
                    parameter.update(gradient, -self.lr)

            accuracy = dataset.get_validation_accuracy()
            print(accuracy)
            if accuracy > self.target_accuracy:
                break
class LanguageIDModel(object):
    """
    Model that identifies the language of a single word.

    The methods below are still unimplemented placeholders. RegressionModel
    documents the shared method API in more detail and is the recommended
    model to implement first.
    """

    def __init__(self):
        # The dataset covers five languages whose combined alphabets contain
        # 47 distinct characters. Code in this class may refer to
        # self.num_chars or len(self.languages).
        self.num_chars = 47
        self.languages = ["English", "Spanish", "Finnish", "Dutch", "Polish"]

        # Model parameters are to be created here.
        "*** YOUR CODE HERE ***"

    def run(self, xs):
        """
        Compute language scores for a batch of words.

        Words differ in length, but the data pipeline guarantees that all
        words in one batch share the same length L. `xs` is a list of L nodes,
        one per character position. Each node has shape
        (batch_size x self.num_chars) and every row is the one-hot encoding of
        a character. Example: in a batch of 8 three-letter words ending with
        "cat", xs[1] holds a 1 at position (7, 0). Row 7 is the last word of
        the batch and column 0 is "a", the initial (0th) letter of the
        combined alphabet.

        The intended implementation uses a recurrent neural network to fold
        `xs` into one node of shape (batch_size x hidden_size), for a
        hidden_size of your choosing, and then maps it to a node of shape
        (batch_size x 5). Higher scores mean a higher probability that the
        word comes from that language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
        Returns:
            A node with shape (batch_size x 5) containing predicted scores
                (also called logits)
        """
        "*** YOUR CODE HERE ***"

    def get_loss(self, xs, y):
        """
        Compute the loss for a batch of words.

        `y` holds the correct labels as a node of shape (batch_size x 5).
        Each row is a one-hot vector selecting the correct language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
            y: a node with shape (batch_size x 5)
        Returns: a loss node
        """
        "*** YOUR CODE HERE ***"

    def train(self, dataset):
        """
        Train the model on `dataset`.
        """
        "*** YOUR CODE HERE ***"
| @@ -0,0 +1,393 @@ | |||
| import numpy as np | |||
| np.random.seed(42) | |||
def format_shape(shape):
    """
    Formats a shape for display: dimensions joined by "x" (e.g. "2x3"), or
    "()" when the shape is empty/falsy.
    """
    if not shape:
        return "()"
    return "x".join(str(dim) for dim in shape)
class Node(object):
    """
    Common base class of every node in the computation graph. It only
    provides a readable repr showing the concrete type, the shape of
    `self.data`, and the object's address.
    """
    def __repr__(self):
        kind = type(self).__name__
        dims = format_shape(self.data.shape)
        address = hex(id(self))
        return "<{} shape={} at {}>".format(kind, dims, address)
class DataNode(Node):
    """
    Base class shared by Parameter and Constant nodes: a graph leaf that
    simply stores a value. There should be no need to use it directly.
    """
    def __init__(self, data):
        self.data = data
        # Leaves have no inputs, so back-propagation stops here.
        self.parents = []

    def _forward(self, *inputs):
        # The value of a leaf is its stored data, whatever inputs are passed.
        return self.data

    @staticmethod
    def _backward(gradient, *inputs):
        # No parents, hence no parent gradients to report.
        return list()
class Parameter(DataNode):
    """
    Trainable 2-D parameter of a neural network (or perceptron).

    Values are initialized uniformly in [-limit, limit] with
    limit = sqrt(3 / mean(shape)). Use the `update` method to change the
    parameter while training.
    """
    def __init__(self, *shape):
        assert len(shape) == 2, (
            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            "Shape must consist of positive integers, got {!r}".format(shape))
        bound = np.sqrt(3.0 / np.mean(shape))
        initial_values = np.random.uniform(low=-bound, high=bound, size=shape)
        super().__init__(initial_values)

    def update(self, direction, multiplier):
        """
        Adds `multiplier * direction` to the parameter in place.

        Inputs:
            direction: a Constant node with the same shape as this parameter
            multiplier: a Python int or float
        Fails an assertion if any entry is NaN or infinite after the update.
        """
        assert isinstance(direction, Constant), (
            "Update direction must be a {} node, instead has type {!r}".format(
                Constant.__name__, type(direction).__name__))
        assert direction.data.shape == self.data.shape, (
            "Update direction shape {} does not match parameter shape "
            "{}".format(
                format_shape(direction.data.shape),
                format_shape(self.data.shape)))
        assert isinstance(multiplier, (int, float)), (
            "Multiplier must be a Python scalar, instead has type {!r}".format(
                type(multiplier).__name__))
        step = multiplier * direction.data
        # In-place so that anything holding a reference to self.data sees it.
        self.data += step
        assert np.all(np.isfinite(self.data)), (
            "Parameter contains NaN or infinity after update, cannot continue")
class Constant(DataNode):
    """
    A fixed-value leaf of the graph. Constants represent:
    * Input features
    * Output labels
    * Gradients computed by back-propagation

    Constants normally do not have to be built by hand: the dataset provides
    them, and so does `nn.gradients`.
    """
    def __init__(self, data):
        is_array = isinstance(data, np.ndarray)
        assert is_array, (
            "Data should be a numpy array, instead has type {!r}".format(
                type(data).__name__))
        is_float = np.issubdtype(data.dtype, np.floating)
        assert is_float, (
            "Data should be a float array, instead has data type {!r}".format(
                data.dtype))
        super().__init__(data)
class FunctionNode(Node):
    """
    A node whose value is computed from other nodes. Construction records the
    parents and eagerly evaluates `_forward` on their data, which is the
    book-keeping needed later to compute gradients.
    """
    def __init__(self, *parents):
        assert all(isinstance(parent, Node) for parent in parents), (
            "Inputs must be node objects, instead got types {!r}".format(
                tuple(type(parent).__name__ for parent in parents)))
        self.parents = parents
        parent_values = [parent.data for parent in parents]
        self.data = self._forward(*parent_values)
class Add(FunctionNode):
    """
    Element-wise sum of two matrices.

    Usage: nn.Add(x, y)
    Inputs:
        x: a Node with shape (batch_size x num_features)
        y: a Node with the same shape as x
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        lhs, rhs = inputs[0], inputs[1]
        assert lhs.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                lhs.ndim))
        assert rhs.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                rhs.ndim))
        assert lhs.shape == rhs.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(lhs.shape), format_shape(rhs.shape)))
        return lhs + rhs

    @staticmethod
    def _backward(gradient, *inputs):
        first = inputs[0]
        assert gradient.shape == first.shape
        # d(x + y)/dx = d(x + y)/dy = 1: both parents get the same gradient.
        return [gradient] * 2
class AddBias(FunctionNode):
    """
    Adds one bias row vector to every feature vector of a batch.

    Usage: nn.AddBias(features, bias)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        bias: a Node with shape (1 x num_features)
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, bias = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert bias.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                bias.ndim))
        assert bias.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(bias.shape)))
        assert features.shape[1] == bias.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(features.shape), format_shape(bias.shape)))
        # The single bias row broadcasts over the batch dimension.
        return features + bias

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # The bias was shared by every row, so its gradient is the column sum.
        bias_gradient = np.sum(gradient, axis=0, keepdims=True)
        return [gradient, bias_gradient]
class DotProduct(FunctionNode):
    """
    Dot product of every row of a batch with one weight vector.

    Usage: nn.DotProduct(features, weights)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        weights: a Node with shape (1 x num_features)
    Output: a Node with shape (batch_size x 1)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert weights.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(weights.shape)))
        assert features.shape[1] == weights.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(features.shape), format_shape(weights.shape)))
        return np.dot(features, weights.T)

    @staticmethod
    def _backward(gradient, *inputs):
        # Reference implementation, intentionally left disabled:
        # assert gradient.shape[0] == inputs[0].shape[0]
        # assert gradient.shape[1] == 1
        # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
        raise NotImplementedError(
            "Backpropagation through DotProduct nodes is not needed in this "
            "assignment")
class Linear(FunctionNode):
    """
    Linear transformation of a batch: the matrix product features @ weights.

    Usage: nn.Linear(features, weights)
    Inputs:
        features: a Node with shape (batch_size x input_features)
        weights: a Node with shape (input_features x output_features)
    Output: a node with shape (batch_size x output_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert features.shape[1] == weights.shape[0], (
            "Second dimension of first input should match first dimension of "
            "second input, instead got shapes {} and {}".format(
                format_shape(features.shape), format_shape(weights.shape)))
        return np.dot(features, weights)

    @staticmethod
    def _backward(gradient, *inputs):
        features, weights = inputs[0], inputs[1]
        assert gradient.shape[0] == features.shape[0]
        assert gradient.shape[1] == weights.shape[1]
        # For Y = X W: dL/dX = dL/dY W^T and dL/dW = X^T dL/dY.
        features_gradient = np.dot(gradient, weights.T)
        weights_gradient = np.dot(features.T, gradient)
        return [features_gradient, weights_gradient]
class ReLU(FunctionNode):
    """
    Element-wise Rectified Linear Unit nonlinearity, max(x, 0): every
    negative entry of the input is replaced with zero.

    Usage: nn.ReLU(x)
    Input:
        x: a Node with shape (batch_size x num_features)
    Output: a Node with the same shape as x, but no negative entries
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
        values = inputs[0]
        assert values.ndim == 2, (
            "Input should have 2 dimensions, instead has {}".format(
                values.ndim))
        return np.maximum(values, 0)

    @staticmethod
    def _backward(gradient, *inputs):
        values = inputs[0]
        assert gradient.shape == values.shape
        # The gradient passes only where the input was strictly positive.
        pass_through = np.where(values > 0, 1.0, 0.0)
        return [gradient * pass_through]
class SquareLoss(FunctionNode):
    """
    Mean squared-error loss. The node computes 0.5 * (a[i,j] - b[i,j])**2 at
    every position (i,j) of the inputs, giving a (batch_size x dim) matrix,
    and returns the mean over all elements of that matrix.

    Usage: nn.SquareLoss(a, b)
    Inputs:
        a: a Node with shape (batch_size x dim)
        b: a Node with shape (batch_size x dim)
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        first, second = inputs[0], inputs[1]
        assert first.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                first.ndim))
        assert second.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                second.ndim))
        assert first.shape == second.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(first.shape), format_shape(second.shape)))
        half_squared_error = np.square(first - second) / 2
        return np.mean(half_squared_error)

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        first, second = inputs[0], inputs[1]
        # The forward pass is a mean, hence the division by the element count.
        count = first.size
        first_gradient = gradient * (first - second) / count
        second_gradient = gradient * (second - first) / count
        return [first_gradient, second_gradient]
class SoftmaxLoss(FunctionNode):
    """
    Batched softmax (cross-entropy) loss for classification problems.

    IMPORTANT: the order of the two inputs matters -- do not swap them!

    Usage: nn.SoftmaxLoss(logits, labels)
    Inputs:
        logits: a Node with shape (batch_size x num_classes). Each row holds
            the scores of one example for every class; a score can be an
            arbitrary real number.
        labels: a Node with shape (batch_size x num_classes) that encodes the
            correct labels for the examples. All entries must be non-negative
            and the sum of values along each row should be 1.
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def log_softmax(logits):
        # Subtracting the row-wise maximum first keeps exp() from overflowing.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        row_log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        shifted -= row_log_norm
        return shifted

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        logits, labels = inputs[0], inputs[1]
        assert logits.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                logits.ndim))
        assert labels.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                labels.ndim))
        assert logits.shape == labels.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(logits.shape), format_shape(labels.shape)))
        assert np.all(labels >= 0), (
            "All entries in the labels input must be non-negative")
        assert np.allclose(np.sum(labels, axis=1), 1), (
            "Labels input must sum to 1 along each row")
        log_probs = SoftmaxLoss.log_softmax(logits)
        per_example_loss = -np.sum(labels * log_probs, axis=1)
        return np.mean(per_example_loss)

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        logits, labels = inputs[0], inputs[1]
        log_probs = SoftmaxLoss.log_softmax(logits)
        # The forward pass averages over the batch, hence the division.
        batch_size = logits.shape[0]
        logits_gradient = gradient * (np.exp(log_probs) - labels) / batch_size
        labels_gradient = gradient * -log_probs / batch_size
        return [logits_gradient, labels_gradient]
def gradients(loss, parameters):
    """
    Computes and returns the gradient of the loss with respect to the provided
    parameters, by reverse-mode back-propagation through the graph that
    produced `loss`.

    Usage: nn.gradients(loss, parameters)
    Inputs:
        loss: a SquareLoss or SoftmaxLoss node
        parameters: a list (or any iterable, including a generator)
            containing Parameter nodes
    Output: a list of Constant objects, representing the gradient of the loss
        with respect to each provided parameter, in the order given.

    A loss node can be back-propagated only once; a second call with the same
    node fails an assertion.
    """
    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
        "Loss must be a loss node, instead has type {!r}".format(
            type(loss).__name__))
    # `parameters` is traversed several times below. Materialize it once so a
    # one-shot iterable (e.g. a generator) is not exhausted by the first
    # traversal, which would make the function silently return [].
    parameters = list(parameters)
    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
        "Parameters must all have type {}, instead got types {!r}".format(
            Parameter.__name__,
            tuple(type(parameter).__name__ for parameter in parameters)))
    assert not hasattr(loss, "used"), (
        "Loss node has already been used for backpropagation, cannot reuse")
    loss.used = True

    seen = set()
    tape = []  # nodes in topological order: parents always precede children

    def visit(node):
        if node in seen:
            return
        for parent in node.parents:
            visit(parent)
        seen.add(node)
        tape.append(node)

    visit(loss)
    # Parameters that do not influence the loss still get a (zero) gradient.
    seen.update(parameters)

    grads = {node: np.zeros_like(node.data) for node in seen}
    grads[loss] = 1.0

    # Walk the tape backwards so that a node's gradient is complete before it
    # is propagated to the node's parents.
    for node in reversed(tape):
        parent_grads = node._backward(
            grads[node], *(parent.data for parent in node.parents))
        for parent, parent_grad in zip(node.parents, parent_grads):
            grads[parent] += parent_grad

    return [Constant(grads[parameter]) for parameter in parameters]
def as_scalar(node):
    """
    Returns the value of a Node as a standard Python number. This only works
    for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
    DotProduct with a batch size of 1 element).

    The node is left untouched: reading the value does not change the shape
    of `node.data`.
    """
    assert isinstance(node, Node), (
        "Input must be a node object, instead has type {!r}".format(
            type(node).__name__))
    assert node.data.size == 1, (
        "Node has shape {}, cannot convert to a scalar".format(
            format_shape(node.data.shape)))
    # ndarray.item() extracts the single element of a size-1 array of any
    # shape without having to overwrite node.data with a flattened copy.
    return node.data.item()
| @@ -0,0 +1,36 @@ | |||
| import collections | |||
| import os | |||
| import time | |||
| import matplotlib.pyplot as plt | |||
| import numpy as np | |||
| import uctc.nn as nn | |||
| use_graphics = True | |||
def maybe_sleep_and_close(seconds):
    """
    When graphics are enabled and at least one matplotlib figure is open,
    sleeps for `seconds` and then closes every open figure. Does nothing
    otherwise.
    """
    if not (use_graphics and plt.get_fignums()):
        return
    time.sleep(seconds)
    for fignum in plt.get_fignums():
        fig = plt.figure(fignum)
        plt.close(fig)
        try:
            # This raises a TclError on some Windows machines
            fig.canvas.start_event_loop(1e-3)
        except Exception:
            # Best effort only. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit still propagate.
            pass
def get_data_path(filename):
    """
    Locates a data file relative to this module and returns its path.

    Candidates are tried in order: `<module dir>/../data/<filename>`,
    `<module dir>/data/<filename>`, `<module dir>/<filename>`. The first one
    that exists is returned; an Exception is raised when none does.
    """
    here = os.path.dirname(__file__)
    candidates = (
        os.path.join(here, os.pardir, "data", filename),
        os.path.join(here, "data", filename),
        os.path.join(here, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Could not find data file: {}".format(filename))
| @@ -0,0 +1,232 @@ | |||
| import numpy as np | |||
| import time | |||
| import os | |||
| import collections | |||
| import matplotlib.pyplot as plt | |||
| import uctc.nn as nn | |||
| from utils import parameter_data, Dataset | |||
| use_graphics = False | |||
class DigitClassificationModel(object):
    """
    Handwritten digit classifier for the MNIST dataset.

    Every digit is a 28x28 pixel grayscale image that is flattened into a
    784-dimensional vector for this model; each entry of the vector is a
    floating point number between 0 and 1.

    The goal is to sort each digit into one of 10 classes (number 0 through 9).

    (RegressionModel documents the general API shared by the methods below;
    implementing RegressionModel first is the recommended order.)
    """
    def __init__(self):
        # Initialize your model parameters here
        "*** YOUR CODE HERE ***"
        # Training hyper-parameters.
        self.lr = 0.01
        self.batch_size = 100

        # Layer widths: 784 -> 200 -> 100 -> 10.
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10

        # Weights and (1 x width) biases of the three layers, created in
        # layer order.
        self.w1 = nn.Parameter(parameter_data(self.input_features, self.h1))
        self.b1 = nn.Parameter(parameter_data(1, self.h1))
        self.w2 = nn.Parameter(parameter_data(self.h1, self.h2))
        self.b2 = nn.Parameter(parameter_data(1, self.h2))
        self.w3 = nn.Parameter(parameter_data(self.h2, self.output_features))
        self.b3 = nn.Parameter(parameter_data(1, self.output_features))

    def run(self, x):
        """
        Runs the model for a batch of examples.

        The result is a node with shape (batch_size x 10) containing scores;
        a higher score means a higher probability that the image belongs to
        that class.

        Inputs:
            x: a node with shape (batch_size x 784)
        Output:
            A node with shape (batch_size x 10) containing predicted scores
                (also called logits)
        """
        "*** YOUR CODE HERE ***"
        # Two hidden ReLU layers followed by a linear output layer.
        hidden1 = nn.Linear(x, self.w1)
        hidden1 = nn.AddBias(hidden1, self.b1)
        hidden1 = nn.ReLU(hidden1)

        hidden2 = nn.Linear(hidden1, self.w2)
        hidden2 = nn.AddBias(hidden2, self.b2)
        hidden2 = nn.ReLU(hidden2)

        logits = nn.Linear(hidden2, self.w3)
        return nn.AddBias(logits, self.b3)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        `y` holds the correct labels as a node with shape (batch_size x 10);
        each row is a one-hot vector encoding the correct digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss node
        """
        "*** YOUR CODE HERE ***"
        logits = self.run(x)
        return nn.SoftmaxLoss(logits, y)

    def train(self, dataset):
        """
        Trains the model: repeats full passes over the dataset until the
        validation accuracy exceeds 0.95.
        """
        "*** YOUR CODE HERE ***"
        reached_target = False
        while not reached_target:
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                params = [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3]
                grads = nn.gradients(loss, params)
                # NOTE(review): the step uses +lr; presumably uctc's
                # Parameter.update subtracts multiplier * direction -- confirm.
                for param, grad in zip(params, grads):
                    param.update(grad, self.lr)
            accuracy = dataset.get_validation_accuracy()
            print(accuracy)
            reached_target = accuracy > 0.95
def get_data_path(filename):
    """
    Locates a data file relative to this script and returns its path.

    Candidates are tried in order: `<script dir>/../data/<filename>`,
    `<script dir>/data/<filename>`, `<script dir>/<filename>`. The first one
    that exists is returned; an Exception is raised when none does.
    """
    base_dir = os.path.dirname(__file__)
    search_order = [
        (base_dir, os.pardir, "data", filename),
        (base_dir, "data", filename),
        (base_dir, filename),
    ]
    for parts in search_order:
        path = os.path.join(*parts)
        if os.path.exists(path):
            return path
    raise Exception("Could not find data file: {}".format(filename))
class DigitClassificationDataset(Dataset):
    """
    MNIST dataset wrapper that feeds training batches to a
    DigitClassificationModel and periodically reports (and, when
    `use_graphics` is set, plots) the model's accuracy on a held-out split.
    """
    def __init__(self, model: DigitClassificationModel):
        """
        Loads "mnist.npz", splits the 10000 test examples into dev/test
        halves, one-hot encodes the training labels and, when graphics are
        enabled, builds the matplotlib figure used by `iterate_once`.

        Inputs:
            model: the model whose `run` method is used for evaluation
        """
        mnist_path = get_data_path("mnist.npz")
        with np.load(mnist_path) as data:
            train_images = data["train_images"]
            train_labels = data["train_labels"]
            test_images = data["test_images"]
            test_labels = data["test_labels"]
            assert len(train_images) == len(train_labels) == 60000
            assert len(test_images) == len(test_labels) == 10000
            # Even-indexed test examples form the dev (validation) split and
            # odd-indexed ones the test split; each is copied explicitly.
            self.dev_images = np.array(test_images[0::2], copy=True)
            self.dev_labels = np.array(test_labels[0::2], copy=True)
            self.test_images = np.array(test_images[1::2], copy=True)
            self.test_labels = np.array(test_labels[1::2], copy=True)
        # One-hot encode the integer training labels: one row per example
        # with a 1 in the column of its label.
        train_labels_one_hot = np.zeros((len(train_images), 10))
        train_labels_one_hot[range(len(train_images)), train_labels] = 1
        super().__init__(train_images, train_labels_one_hot)
        self.model = model
        self.epoch = 0
        if use_graphics:
            width = 20 # Width of each row expressed as a multiple of image width
            samples = 100 # Number of images to display per label
            fig = plt.figure()
            ax = {}
            images = collections.defaultdict(list)
            texts = collections.defaultdict(list)
            # One horizontal strip per digit. Digit 9 is created first so the
            # other strips can share its x axis via ax.get(9).
            for i in reversed(range(10)):
                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
                                         sharex=ax.get(9))
                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
                ax[i].set_yticks([])
                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
                           va="center")
                ax[i].set_xlim(0, 28 * width)
                ax[i].set_ylim(0, 28)
                # Pre-create blank image/text artists; iterate_once only
                # updates their contents and positions.
                for j in range(samples):
                    images[i].append(ax[i].imshow(
                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
                        alpha=0.3))
                    texts[i].append(ax[i].text(
                        0, 0, "", ha="center", va="top", fontsize="smaller"))
            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
            ax[9].set_xticklabels(
                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
            ax[9].tick_params(axis="x", pad=16)
            ax[9].set_xlabel("Probability of Correct Label")
            status = ax[0].text(
                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
                va="bottom")
            plt.show(block=False)
            self.width = width
            self.samples = samples
            self.fig = fig
            self.images = images
            self.texts = texts
            self.status = status
        # iterate_once reads self.last_update whether or not graphics are
        # enabled, so it has to be set unconditionally.
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yields the (x, y) training batches of one epoch. After a batch has
        been consumed, and at most about once per second, evaluates the model
        on the dev split, prints the accuracy and (with graphics enabled)
        refreshes the figure.
        """
        self.epoch += 1
        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y
            if time.time() - self.last_update > 1:
                # NOTE(review): `.tensor()`, `nn.argmax`, `nn.log_softmax`,
                # `nn.exp` and `.data()` come from uctc.nn; presumably
                # `.data()` returns a flat sequence of values -- confirm.
                dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
                # dev_logits = np.array(dev_logits_raw.data()).reshape(5000, 10)
                # dev_predicted = np.argmax(dev_logits, axis=1)
                dev_argmax = nn.argmax(dev_logits, axis=1)
                dev_predicted = np.array(dev_argmax.data())
                # sftmax = np.array(nn.log_softmax(nn.pyarray_to_tensor(dev_logits)).data()).reshape(5000, 10)
                sftmax = nn.log_softmax(dev_logits)
                # 5000 dev examples (half of the 10000 test examples) x 10 classes.
                dev_probs = np.array(nn.exp(sftmax).data()).reshape(5000, 10)
                dev_accuracy = np.mean(dev_predicted == self.dev_labels)
                print("epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
                      "{:.2%}".format(
                          self.epoch, i, len(self.x) // batch_size, dev_accuracy))
                if use_graphics:
                    self.status.set_text(
                        "epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
                        "{:.2%}".format(
                            self.epoch, i, len(self.x) // batch_size, dev_accuracy))
                    # Reusing `i` here is harmless: enumerate rebinds it on
                    # the next batch.
                    for i in range(10):
                        predicted = dev_predicted[self.dev_labels == i]
                        probs = dev_probs[self.dev_labels == i][:, i]
                        # Pick self.samples examples evenly spread over the
                        # examples of digit i sorted by the probability
                        # assigned to the correct label.
                        linspace = np.linspace(
                            0, len(probs) - 1, self.samples).astype(int)
                        indices = probs.argsort()[linspace]
                        for j, (prob, image) in enumerate(zip(
                                probs[indices],
                                self.dev_images[self.dev_labels == i][indices])):
                            self.images[i][j].set_data(image.reshape((28, 28)))
                            # Horizontal position encodes the probability.
                            left = prob * (self.width - 1) * 28
                            if predicted[indices[j]] == i:
                                self.images[i][j].set_cmap("Greens")
                                self.texts[i][j].set_text("")
                            else:
                                # Misclassified: draw in red and label with
                                # the predicted digit.
                                self.images[i][j].set_cmap("Reds")
                                self.texts[i][j].set_text(predicted[indices[j]])
                                self.texts[i][j].set_x(left + 14)
                            self.images[i][j].set_extent([left, left + 28, 0, 28])
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

    def get_validation_accuracy(self):
        """
        Returns the fraction of dev-split examples for which the class with
        the highest score equals the label.
        """
        # print(self.dev_images[:2].tolist())
        dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
        dev_predicted = np.array(nn.argmax(dev_logits, axis=1).data())
        dev_accuracy = np.mean(dev_predicted == self.dev_labels)
        return dev_accuracy
# Script body: build the digit classifier, wrap it in the MNIST dataset (which
# evaluates the model while iterating) and train it. These statements are at
# module level, so they run when the file is executed or imported.
model = DigitClassificationModel()
dataset = DigitClassificationDataset(model)
model.train(dataset)
| @@ -0,0 +1,129 @@ | |||
| import numpy as np | |||
| import time | |||
| import os | |||
| import matplotlib.pyplot as plt | |||
| import uctc.nn as nn | |||
| from utils import parameter_data, Dataset | |||
| use_graphics = False | |||
class PerceptronModel(object):
    """Binary perceptron classifier built on a single uctc.nn weight Parameter."""
    def __init__(self, dimensions):
        """
        Initialize a new Perceptron instance.

        A perceptron classifies data points as either belonging to a particular
        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
        For example, dimensions=2 would mean that the perceptron must classify
        2D points.
        """
        initial_weights = parameter_data(dimensions, 1)
        self.w = nn.Parameter(initial_weights)

    def get_weights(self):
        """
        Return the current weights of the perceptron, as given by the
        parameter's `data()` accessor.
        NOTE(review): presumably a flat sequence of numbers rather than a
        Parameter instance -- confirm against uctc.nn.
        """
        weights = self.w
        return weights.data()

    def run(self, x):
        """
        Calculates the score assigned by the perceptron to a data point x.

        Inputs:
            x: a node with shape (1 x dimensions)
        Returns: a node containing a single number (the score)
        """
        "*** YOUR CODE HERE ***"
        return nn.Linear(x, self.w)

    def get_prediction(self, x):
        """
        Calculates the predicted class for a single data point `x`.

        Returns: 1 or -1
        """
        "*** YOUR CODE HERE ***"
        score = self.run(x).data()[0]
        # score = np.array(x.data()).dot(np.array(self.w.data()))
        # A score of exactly zero counts as the positive class.
        return 1 if score >= 0 else -1

    def train(self, dataset):
        """
        Train the perceptron until convergence, i.e. until one full pass over
        the dataset produces no misclassified point.
        """
        "*** YOUR CODE HERE ***"
        batch_size = 1
        while True:
            mistakes = 0
            for x, y in dataset.iterate_once(batch_size):
                prediction = self.get_prediction(x)
                features = np.array(x.data(), dtype=np.float32)
                label = int(y.data()[0])
                # assert 0
                if prediction == label:
                    continue
                # print(prediction, label)
                mistakes += 1
                # NOTE(review): the multiplier is -label; presumably uctc's
                # Parameter.update subtracts multiplier * direction, making
                # this the usual w += label * x step -- confirm.
                self.w.update(nn.pyarray_to_tensor(features), -label)
                # time.sleep(0.01)
            if mistakes == 0:
                break
class PerceptronDataset(Dataset):
    """
    Synthetic, linearly separable 2-D dataset for the perceptron, with
    periodic progress printing and optional live plotting of the decision
    boundary.
    """
    def __init__(self, model: PerceptronModel):
        """
        Generates 500 random points and their +1/-1 labels and, when graphics
        are enabled, creates the scatter plot.

        Inputs:
            model: the perceptron whose weights are reported while iterating
        """
        points = 500
        # Two standard-normal coordinates plus a constant 1 column, so the
        # third weight acts as the bias term.
        x = np.hstack([np.random.randn(points, 2), np.ones((points, 1))])
        # Ground truth: +1 on or above the line x0 + 2*x1 - 1 = 0, -1 below.
        y = np.where(x[:, 0] + 2 * x[:, 1] - 1 >= 0, 1.0, -1.0)
        super().__init__(x, np.expand_dims(y, axis=1))
        self.model = model
        self.epoch = 0
        limits = np.array([-3.0, 3.0])
        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(limits)
            ax.set_ylim(limits)
            # Plot only the two real coordinates (the constant column is dropped).
            positive = ax.scatter(*x[y == 1, :-1].T, color="red", marker="+")
            negative = ax.scatter(*x[y == -1, :-1].T, color="blue", marker="_")
            line, = ax.plot([], [], color="black")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([positive, negative], [1, -1])
            plt.show(block=False)
            self.fig = fig
            self.line = line
            self.text = text
        # iterate_once reads both attributes whether or not graphics are
        # enabled, so they have to be set unconditionally.
        self.limits = limits
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yields the (x, y) batches of one epoch. After a batch has been
        consumed, and at most about once per millisecond, prints the progress
        and current weights and (with graphics enabled) redraws the decision
        boundary.
        """
        self.epoch += 1
        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y
            if time.time() - self.last_update > 0.001:
                w = self.model.get_weights()
                limits = self.limits
                print(f"epoch: {self.epoch}\npoint: {i * batch_size + 1}/{len(self.x)}\nweights: {w}")
                if use_graphics:
                    # Decision boundary: w[0]*x0 + w[1]*x1 + w[2] = 0.
                    if w[1] != 0:
                        # Solve for x1 at the two x0 limits.
                        self.line.set_data(limits, (-w[0] * limits - w[2]) / w[1])
                    elif w[0] != 0:
                        # Vertical boundary at x0 = -w[2] / w[0].
                        self.line.set_data(np.full(2, -w[2] / w[0]), limits)
                    else:
                        # Both slopes are zero: there is no line to draw.
                        self.line.set_data([], [])
                    self.text.set_text(
                        f"epoch: {self.epoch}\npoint: {i * batch_size + 1}/{len(self.x)}\nweights: {w}")
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()
# Script body: build a perceptron over 3 inputs (two coordinates plus the
# constant bias column of PerceptronDataset) and train it until convergence.
# These statements are at module level, so they run when the file is executed
# or imported.
model = PerceptronModel(3)
dataset = PerceptronDataset(model)
model.train(dataset)
| @@ -0,0 +1,141 @@ | |||
| import numpy as np | |||
| np.random.seed(42) | |||
| import time | |||
| import os | |||
| import matplotlib.pyplot as plt | |||
| import uctc.nn as nn | |||
| from utils import parameter_data, Dataset | |||
| use_graphics = False | |||
class RegressionModel(object):
    """
    A neural network model for approximating a function that maps from real
    numbers to real numbers. The network should be sufficiently large to be able
    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.

    Architecture: Linear(1 -> 50) + bias -> ReLU -> Linear(50 -> 1) + bias.
    """

    def __init__(self):
        # Hyperparameters.
        self.batch_size = 10
        self.input_features = 1
        self.output_features = 1
        self.hidden_f1 = 50
        self.lr = 0.01
        # Trainable parameters; creation order is kept so the random
        # initialisation sequence is unchanged.
        self.w1 = nn.Parameter(parameter_data(self.input_features, self.hidden_f1))
        self.b1 = nn.Parameter(parameter_data(1, self.hidden_f1))
        self.w2 = nn.Parameter(parameter_data(self.hidden_f1, self.output_features))
        self.b2 = nn.Parameter(parameter_data(1, self.output_features))

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
        Returns:
            A node with shape (batch_size x 1) containing predicted y-values
        """
        # The former NumPy re-computation of the same forward pass was never
        # used and has been dropped; only the uctc graph is built.
        hidden = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        return nn.AddBias(nn.Linear(hidden, self.w2), self.b2)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
            y: a node with shape (batch_size x 1), containing the true y-values
                to be used for training
        Returns: a loss node
        """
        return nn.SquareLoss(self.run(x), y)

    def train(self, dataset):
        """
        Trains the model.

        Sweeps the dataset repeatedly, applying one update per batch, and
        returns once the loss of the last batch in a sweep drops below 0.01.
        Returns immediately if the dataset yields no batches.

        NOTE(review): the source lost its indentation, so whether the original
        threshold test ran per batch or per sweep is not recoverable; the
        per-sweep form is used here and exits with ``return`` so the outer
        loop cannot spin forever -- confirm the intended criterion.
        """
        params = (self.w1, self.b1, self.w2, self.b2)
        while True:
            loss = None
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, list(params))
                for param, grad in zip(params, (g_w1, g_b1, g_w2, g_b2)):
                    param.update(grad, self.lr)
            if loss is None or loss.data()[0] < 0.01:
                return
class RegressionDataset(Dataset):
    """
    Dataset of 200 shuffled samples of sin(x) on [-2pi, 2pi] that also reports
    training progress (and optionally plots it) while batches are consumed.
    """

    def __init__(self, model: RegressionModel):
        x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        np.random.RandomState(0).shuffle(x)
        # Index order that restores ascending x, used when plotting curves.
        self.argsort_x = np.argsort(x.flatten())
        y = np.sin(x)
        super().__init__(x, y)

        self.model = model
        self.processed = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(-2 * np.pi, 2 * np.pi)
            ax.set_ylim(-1.4, 1.4)
            real, = ax.plot(x[self.argsort_x], y[self.argsort_x], color="blue")
            learned, = ax.plot([], [], color="red")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([real, learned], ["real", "learned"])
            plt.show(block=False)

            self.fig = fig
            self.learned = learned
            self.text = text
        # Always initialised: iterate_once reads it even when graphics are off.
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yield (x, y) batches from the parent dataset.

        After each batch is consumed the processed counter grows by
        ``batch_size``; at most once every 10 ms the loss on that batch is
        printed and, if ``use_graphics`` is set, the learned curve is redrawn.
        """
        for x, y in super().iterate_once(batch_size):
            yield x, y
            self.processed += batch_size

            if time.time() - self.last_update > 0.01:
                loss = self.model.get_loss(x, y).data()[0]
                report = f"processed: {self.processed}\nloss: {loss: .6f}"
                print(report)
                if use_graphics:
                    # The full-dataset prediction is only needed for the plot,
                    # so it is no longer computed when graphics are disabled.
                    predicted = np.array(self.model.run(nn.Constant(self.x)).data())
                    self.learned.set_data(self.x[self.argsort_x], predicted[self.argsort_x])
                    self.text.set_text(report)
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()
| # Script entry point: build the regression model, attach the dataset that | |||
| # reports progress while iterating, and train until RegressionModel.train returns. | |||
| model = RegressionModel() | |||
| dataset = RegressionDataset(model) | |||
| model.train(dataset) | |||
| @@ -0,0 +1,72 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
class LinearTestModel:
    """uctc model under test: adds a single trainable bias row to its input."""

    def __init__(self, output_features):
        self.b1 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the node x + b1 (bias applied via nn.AddBias)."""
        return nn.AddBias(x, self.b1)

    def get_loss(self, x, y):
        """Return the nn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the data of the loss gradient with respect to b1."""
        grads = nn.gradients(self.get_loss(x, y), [self.b1])
        return grads[0].data()
class StdLinerTestModel:
    """Reference (std_model) twin of LinearTestModel, seeded with its bias."""

    def __init__(self, output_features, tmodel: LinearTestModel):
        self.b1 = stdnn.Parameter(1, output_features)
        # Replace the random init with the bias values held by the uctc model.
        copied_bias = np.array(tmodel.b1.data())
        self.b1.data = copied_bias.reshape(1, output_features)

    def forward(self, x):
        """Return the node x + b1 (bias applied via stdnn.AddBias)."""
        return stdnn.AddBias(x, self.b1)

    def get_loss(self, x, y):
        """Return the stdnn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the b1 gradient of the loss as a flat Python list."""
        grads = stdnn.gradients(self.get_loss(x, y), [self.b1])
        return grads[0].data.flatten().tolist()
def first_mismatch(actual, expected, tol=1e-4):
    """
    Return the index of the first element pair that differs by more than
    ``tol``, or None when all pairs agree.

    Both inputs are flattened before comparison. A NaN on either side counts
    as a mismatch, and a difference in element count raises AssertionError
    instead of being silently truncated.
    """
    got = np.asarray(actual, dtype=np.float64).ravel()
    ref = np.asarray(expected, dtype=np.float64).ravel()
    if got.size != ref.size:
        raise AssertionError(f"Size mismatch: {got.size} values vs {ref.size} reference values")
    bad = ~(np.abs(got - ref) <= tol)
    return int(np.argmax(bad)) if bad.any() else None


output_features = 32
batch_size = 4

x = np.random.randn(batch_size, output_features).astype(np.float32)
y = np.random.randn(batch_size, output_features).astype(np.float32)

# Model under test (uctc).
model = LinearTestModel(output_features)
test_x = nn.Constant(x)
predict_y = model.forward(test_x).data()
test_y = nn.Constant(y)
loss = model.get_loss(test_x, test_y).data()
g_b1 = model.backward(test_x, test_y)

# Reference model, initialised from the uctc model's parameters.
stdmodel = StdLinerTestModel(output_features, model)
std_test_x = stdnn.Constant(x)
std_predict_y = stdmodel.forward(std_test_x)
std_test_y = stdnn.Constant(y)
std_loss = stdmodel.get_loss(std_test_x, std_test_y)
std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

# check forward -- every batch row is compared, not just the first one
if first_mismatch(predict_y, std_predict_y.data) is not None:
    raise AssertionError("Forward data mismatch!")

# check loss
if first_mismatch(loss[0], std_loss.data) is not None:
    raise AssertionError("Loss mismatch!")

# check backward
i = first_mismatch(g_b1, std_g_b1)
if i is not None:
    raise AssertionError(f"Gradient b1 mismatch at position {i}, g_b1 is {g_b1[i]} while std g_b1 is {std_g_b1[i]}")

print("Test passed")
| @@ -0,0 +1,81 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
class LinearTestModel:
    """uctc model under test: one Linear layer followed by a bias."""

    def __init__(self, input_features, output_features):
        self.w1 = nn.Parameter([input_features, output_features])
        self.b1 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the node Linear(x, w1) + b1."""
        return nn.AddBias(nn.Linear(x, self.w1), self.b1)

    def get_loss(self, x, y):
        """Return the nn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return (w1 gradient data, b1 gradient data) for the loss on (x, y)."""
        grad_w1, grad_b1 = nn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w1.data(), grad_b1.data()
class StdLinerTestModel:
    """Reference (std_model) twin of LinearTestModel, seeded with its weights."""

    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
        self.w1 = stdnn.Parameter(input_features, output_features)
        self.b1 = stdnn.Parameter(1, output_features)
        # Replace the random init with the values held by the uctc model.
        weight_values = np.array(tmodel.w1.data())
        bias_values = np.array(tmodel.b1.data())
        self.w1.data = weight_values.reshape(input_features, output_features)
        self.b1.data = bias_values.reshape(1, output_features)

    def forward(self, x):
        """Return the node Linear(x, w1) + b1."""
        return stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1)

    def get_loss(self, x, y):
        """Return the stdnn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the w1 and b1 gradients of the loss as flat Python lists."""
        grad_w1, grad_b1 = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w1.data.flatten().tolist(), grad_b1.data.flatten().tolist()
def first_mismatch(actual, expected, tol=1e-4):
    """
    Return the index of the first element pair that differs by more than
    ``tol``, or None when all pairs agree.

    Both inputs are flattened before comparison. A NaN on either side counts
    as a mismatch, and a difference in element count raises AssertionError
    instead of being silently truncated.
    """
    got = np.asarray(actual, dtype=np.float64).ravel()
    ref = np.asarray(expected, dtype=np.float64).ravel()
    if got.size != ref.size:
        raise AssertionError(f"Size mismatch: {got.size} values vs {ref.size} reference values")
    bad = ~(np.abs(got - ref) <= tol)
    return int(np.argmax(bad)) if bad.any() else None


input_features = 16
output_features = 32
batch_size = 4

x = np.random.randn(batch_size, input_features).astype(np.float32)
y = np.random.randn(batch_size, output_features).astype(np.float32)

# Model under test (uctc).
model = LinearTestModel(input_features, output_features)
test_x = nn.Constant(x)
predict_y = model.forward(test_x).data()
test_y = nn.Constant(y)
loss = model.get_loss(test_x, test_y).data()
g_w1, g_b1 = model.backward(test_x, test_y)

# Reference model, initialised from the uctc model's parameters.
stdmodel = StdLinerTestModel(input_features, output_features, model)
std_test_x = stdnn.Constant(x)
std_predict_y = stdmodel.forward(std_test_x)
std_test_y = stdnn.Constant(y)
std_loss = stdmodel.get_loss(std_test_x, std_test_y)
std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

# check forward -- every batch row is compared, not just the first one
if first_mismatch(predict_y, std_predict_y.data) is not None:
    raise AssertionError("Forward data mismatch!")

# check loss
if first_mismatch(loss[0], std_loss.data) is not None:
    raise AssertionError("Loss mismatch!")

# check backward
for name, got, ref in (("w1", g_w1, std_g_w1), ("b1", g_b1, std_g_b1)):
    i = first_mismatch(got, ref)
    if i is not None:
        raise AssertionError(f"Gradient {name} mismatch at position {i}, g_{name} is {got[i]} while std g_{name} is {ref[i]}")

print("Test passed")
| @@ -0,0 +1,83 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
class LinearTestModel:
    """uctc model under test: Linear layer, bias, then ReLU."""

    def __init__(self, input_features, output_features):
        self.w1 = nn.Parameter([input_features, output_features])
        self.b1 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the node ReLU(Linear(x, w1) + b1)."""
        pre_activation = nn.AddBias(nn.Linear(x, self.w1), self.b1)
        return nn.ReLU(pre_activation)

    def get_loss(self, x, y):
        """Return the nn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return (w1 gradient data, b1 gradient data) for the loss on (x, y)."""
        grad_w1, grad_b1 = nn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w1.data(), grad_b1.data()
class StdLinerTestModel:
    """Reference (std_model) twin of LinearTestModel, seeded with its weights."""

    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
        self.w1 = stdnn.Parameter(input_features, output_features)
        self.b1 = stdnn.Parameter(1, output_features)
        # Replace the random init with the values held by the uctc model.
        weight_values = np.array(tmodel.w1.data())
        bias_values = np.array(tmodel.b1.data())
        self.w1.data = weight_values.reshape(input_features, output_features)
        self.b1.data = bias_values.reshape(1, output_features)

    def forward(self, x):
        """Return the node ReLU(Linear(x, w1) + b1)."""
        pre_activation = stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1)
        return stdnn.ReLU(pre_activation)

    def get_loss(self, x, y):
        """Return the stdnn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the w1 and b1 gradients of the loss as flat Python lists."""
        grad_w1, grad_b1 = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w1.data.flatten().tolist(), grad_b1.data.flatten().tolist()
def first_mismatch(actual, expected, tol=1e-4):
    """
    Return the index of the first element pair that differs by more than
    ``tol``, or None when all pairs agree.

    Both inputs are flattened before comparison. A NaN on either side counts
    as a mismatch, and a difference in element count raises AssertionError
    instead of being silently truncated.
    """
    got = np.asarray(actual, dtype=np.float64).ravel()
    ref = np.asarray(expected, dtype=np.float64).ravel()
    if got.size != ref.size:
        raise AssertionError(f"Size mismatch: {got.size} values vs {ref.size} reference values")
    bad = ~(np.abs(got - ref) <= tol)
    return int(np.argmax(bad)) if bad.any() else None


input_features = 16
output_features = 32
batch_size = 4

x = np.random.randn(batch_size, input_features).astype(np.float32)
y = np.random.randn(batch_size, output_features).astype(np.float32)

# Model under test (uctc).
model = LinearTestModel(input_features, output_features)
test_x = nn.Constant(x)
predict_y = model.forward(test_x).data()
test_y = nn.Constant(y)
loss = model.get_loss(test_x, test_y).data()
g_w1, g_b1 = model.backward(test_x, test_y)

# Reference model, initialised from the uctc model's parameters.
stdmodel = StdLinerTestModel(input_features, output_features, model)
std_test_x = stdnn.Constant(x)
std_predict_y = stdmodel.forward(std_test_x)
std_test_y = stdnn.Constant(y)
std_loss = stdmodel.get_loss(std_test_x, std_test_y)
std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

# check forward -- every batch row is compared, not just the first one
if first_mismatch(predict_y, std_predict_y.data) is not None:
    raise AssertionError("Forward data mismatch!")

# check loss
if first_mismatch(loss[0], std_loss.data) is not None:
    raise AssertionError("Loss mismatch!")

# check backward
for name, got, ref in (("w1", g_w1, std_g_w1), ("b1", g_b1, std_g_b1)):
    i = first_mismatch(got, ref)
    if i is not None:
        raise AssertionError(f"Gradient {name} mismatch at position {i}, g_{name} is {got[i]} while std g_{name} is {ref[i]}")

print("Test passed")
| @@ -0,0 +1,144 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
| np.random.seed(42) | |||
class LinearTestModel:
    """uctc model under test: Linear + bias -> ReLU -> Linear + bias."""

    def __init__(self, input_features, hidden_features, output_features):
        self.w1 = nn.Parameter([input_features, hidden_features])
        self.b1 = nn.Parameter([1, hidden_features])
        self.w2 = nn.Parameter([hidden_features, output_features])
        self.b2 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the output node of the two-layer network for input node x."""
        hidden = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        return nn.AddBias(nn.Linear(hidden, self.w2), self.b2)

    def get_loss(self, x, y):
        """Return the nn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradient data for w1, b1, w2, b2 (in that order)."""
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        return g_w1.data(), g_b1.data(), g_w2.data(), g_b2.data()

    def update(self, x, y, lr):
        """
        Apply one update with multiplier ``lr`` to every parameter, print the
        four gradients, and return the parameter data for w1, b1, w2, b2.
        """
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        pairs = ((self.w1, g_w1), (self.b1, g_b1), (self.w2, g_w2), (self.b2, g_b2))
        for param, grad in pairs:
            param.update(grad, lr)
        # Gradients are dumped only after all four updates, as before.
        for _, grad in pairs:
            print(grad.data())
        return self.w1.data(), self.b1.data(), self.w2.data(), self.b2.data()
class StdLinerTestModel:
    """Reference (std_model) twin of LinearTestModel, seeded with its weights."""

    def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel):
        self.w1 = stdnn.Parameter(input_features, hidden_features)
        self.b1 = stdnn.Parameter(1, hidden_features)
        self.w2 = stdnn.Parameter(hidden_features, output_features)
        self.b2 = stdnn.Parameter(1, output_features)
        # Replace the random init with the values held by the uctc model.
        self.w1.data = np.array(tmodel.w1.data()).reshape(input_features, hidden_features)
        self.b1.data = np.array(tmodel.b1.data()).reshape(1, hidden_features)
        self.w2.data = np.array(tmodel.w2.data()).reshape(hidden_features, output_features)
        self.b2.data = np.array(tmodel.b2.data()).reshape(1, output_features)

    def forward(self, x):
        """Return the output node of the two-layer network for input node x."""
        hidden = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
        return stdnn.AddBias(stdnn.Linear(hidden, self.w2), self.b2)

    def get_loss(self, x, y):
        """Return the stdnn.SquareLoss node between forward(x) and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradients for w1, b1, w2, b2 as flat Python lists."""
        grads = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        g_w1, g_b1, g_w2, g_b2 = grads
        return g_w1.data.flatten().tolist(), g_b1.data.flatten().tolist(), g_w2.data.flatten().tolist(), g_b2.data.flatten().tolist()

    def update(self, x, y, lr):
        """
        Step every parameter by ``-lr`` times its gradient and return the
        updated w1, b1, w2, b2 as flat Python lists.
        """
        g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        # The reference Parameter.update adds multiplier * direction, hence -lr.
        for param, grad in ((self.w1, g_w1), (self.b1, g_b1), (self.w2, g_w2), (self.b2, g_b2)):
            param.update(grad, -lr)
        return self.w1.data.flatten().tolist(), self.b1.data.flatten().tolist(), self.w2.data.flatten().tolist(), self.b2.data.flatten().tolist()
def first_mismatch(actual, expected, tol=1e-4):
    """
    Return the index of the first element pair that differs by more than
    ``tol``, or None when all pairs agree.

    Both inputs are flattened before comparison. A NaN on either side counts
    as a mismatch, and a difference in element count raises AssertionError
    instead of being silently truncated.
    """
    got = np.asarray(actual, dtype=np.float64).ravel()
    ref = np.asarray(expected, dtype=np.float64).ravel()
    if got.size != ref.size:
        raise AssertionError(f"Size mismatch: {got.size} values vs {ref.size} reference values")
    bad = ~(np.abs(got - ref) <= tol)
    return int(np.argmax(bad)) if bad.any() else None


input_features = 1
hidden_features = 50
output_features = 1
batch_size = 10

# Fixed batch of inputs and targets.
x = np.array([-5.146528720855713, 4.451905250549316, 0.4736069440841675, -0.09472138434648514, 4.8939385414123535, 5.209676265716553, -5.967447280883789, 2.9363629817962646, -5.525413990020752, 3.315248489379883]).reshape(batch_size, -1)
y = np.array([0.9072322249412537, -0.9662654995918274, 0.45609915256500244, -0.09457980841398239, -0.9835651516914368, -0.8788799047470093, 0.3105180263519287, 0.2037920206785202, 0.6873041391372681, -0.17278438806533813]).reshape(batch_size, -1)

model = LinearTestModel(input_features, hidden_features, output_features)
stdmodel = StdLinerTestModel(input_features, hidden_features, output_features, model)

# Model under test (uctc).
test_x = nn.Constant(x)
predict_y = model.forward(test_x).data()
test_y = nn.Constant(y)
loss = model.get_loss(test_x, test_y).data()
g_w1, g_b1, g_w2, g_b2 = model.backward(test_x, test_y)
new_w1, new_b1, new_w2, new_b2 = model.update(test_x, test_y, 0)

# Reference model.
std_test_x = stdnn.Constant(x)
std_predict_y = stdmodel.forward(std_test_x)
std_test_y = stdnn.Constant(y)
std_loss = stdmodel.get_loss(std_test_x, std_test_y)
std_g_w1, std_g_b1, std_g_w2, std_g_b2 = stdmodel.backward(std_test_x, std_test_y)
std_new_w1, std_new_b1, std_new_w2, std_new_b2 = stdmodel.update(std_test_x, std_test_y, 0)

# print(predict_y)
# print()
# print(std_predict_y.data.flatten().tolist())
# check forward
if first_mismatch(predict_y, std_predict_y.data) is not None:
    raise AssertionError("Forward data mismatch!")

# print(loss, std_loss.data)
# check loss
if first_mismatch(loss[0], std_loss.data) is not None:
    raise AssertionError("Loss mismatch!")

# check backward
for name, got, ref in (
        ("w1", g_w1, std_g_w1),
        ("b1", g_b1, std_g_b1),
        ("w2", g_w2, std_g_w2),
        ("b2", g_b2, std_g_b2)):
    i = first_mismatch(got, ref)
    if i is not None:
        raise AssertionError(f"Gradient {name} mismatch at position {i}, g_{name} is {got[i]} while std g_{name} is {ref[i]}")

# check update (the b2 / w2 comparisons were disabled in the original and stay disabled)
for name, got, ref in (
        ("b1", new_b1, std_new_b1),
        ("w1", new_w1, std_new_w1)):
    i = first_mismatch(got, ref)
    if i is not None:
        raise AssertionError(f"Updated {name} mismatch at position {i}, new_{name} is {got[i]} while std new_{name} is {ref[i]}")
# for i, (x, y) in enumerate(zip(new_b2, std_new_b2)):
#     if (abs(x-y) > 1e-4):
#         assert 0, f"Updated b2 mismatch at position {i}, new_b2 is {x} while std new_b2 is {y}"
# for i, (x, y) in enumerate(zip(new_w2, std_new_w2)):
#     if (abs(x-y) > 1e-4):
#         assert 0, f"Updated w2 mismatch at position {i}, new_w2 is {x} while std new_w2 is {y}"

print("Test passed")
| @@ -0,0 +1,128 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
| np.random.seed(42) | |||
class LinearTestModel:
    """uctc model under test: Linear + bias -> ReLU -> Linear + bias, with a small training loop."""

    def __init__(self, input_features, hidden_features, output_features):
        self.w1 = nn.Parameter([input_features, hidden_features])
        self.b1 = nn.Parameter([1, hidden_features])
        self.w2 = nn.Parameter([hidden_features, output_features])
        self.b2 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the output node of the two-layer network for input node x."""
        layer_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        prediction = nn.AddBias(nn.Linear(layer_1, self.w2), self.b2)
        # print(f"o1: {prediction.data()[:10]}")
        return prediction

    def get_loss(self, x, y):
        """Return the nn.SquareLoss node between forward(x) and y."""
        return nn.SquareLoss(self.forward(x), y)

    def backward(self, x, y):
        """Return the gradient data for w1, b1, w2, b2 (in that order)."""
        loss = self.get_loss(x, y)
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
        return g_w1.data(), g_b1.data(), g_w2.data(), g_b2.data()

    def update(self, x, y, lr):
        """Apply one update with multiplier ``lr`` to every parameter; returns None."""
        loss = self.get_loss(x, y)
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
        self.w1.update(g_w1, lr)
        self.b1.update(g_b1, lr)
        self.w2.update(g_w2, lr)
        self.b2.update(g_b2, lr)
        # print(g_w1.data())
        # print(g_b1.data())
        # print(g_w2.data())
        # print(g_b2.data())
        # return self.w1.data(), self.b1.data(), self.w2.data(), self.b2.data()

    def train(self):
        """
        Fit sin(x) on 200 points of [-2pi, 2pi] for ``epoch`` sweeps in
        batches of ``batch_size`` (both module-level names), printing the
        loss of the last batch after each sweep.

        NOTE(review): the source lost its indentation; the loss print is
        assumed to run once per sweep -- confirm against the original file.
        """
        self.x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        # np.random.RandomState(0).shuffle(self.x)
        self.argsort_x = np.argsort(self.x.flatten())
        self.y = np.sin(self.x)
        for i in range(epoch):
            np.random.RandomState(0).shuffle(self.x)
            # shuffle() reorders self.x in place only; rebuild the targets so
            # that row k of self.y is still sin of row k of self.x.
            self.y = np.sin(self.x)
            index = 0
            while index < self.x.shape[0]:
                x = self.x[index:index + batch_size]
                y = self.y[index:index + batch_size]
                cx = nn.Constant(x)
                cy = nn.Constant(y)
                self.update(cx, cy, 0.01)
                index += batch_size
                # break
            loss = self.get_loss(cx, cy)
            print(loss.data())
| # Reference (std_model) counterpart of LinearTestModel: Linear + bias -> ReLU -> Linear + bias. | |||
| class StdLinerTestModel: | |||
| # tmodel is accepted but unused here: the weight copy below is commented out, | |||
| # so the parameters keep the random initialisation from stdnn.Parameter. | |||
| def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel): | |||
| self.w1 = stdnn.Parameter(input_features, hidden_features) | |||
| self.b1 = stdnn.Parameter(1, hidden_features) | |||
| self.w2 = stdnn.Parameter(hidden_features, output_features) | |||
| self.b2 = stdnn.Parameter(1, output_features) | |||
| # self.w1.data = np.array(tmodel.w1.data()).reshape(input_features, hidden_features) | |||
| # self.b1.data = np.array(tmodel.b1.data()).reshape(1, hidden_features) | |||
| # self.w2.data = np.array(tmodel.w2.data()).reshape(hidden_features, output_features) | |||
| # self.b2.data = np.array(tmodel.b2.data()).reshape(1, output_features) | |||
| # print(self.w1.data) | |||
| # Returns the output node of the two-layer network for input node x. | |||
| def forward(self, x): | |||
| layer_1 = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1)) | |||
| prediction = stdnn.AddBias(stdnn.Linear(layer_1, self.w2), self.b2) | |||
| # print(f"o2: {prediction.data.flatten()[:10]}") | |||
| return prediction | |||
| # Returns the stdnn.SquareLoss node between forward(x) and y. | |||
| def get_loss(self, x, y): | |||
| return stdnn.SquareLoss(self.forward(x), y) | |||
| # Returns the gradients for w1, b1, w2, b2 as flat Python lists. | |||
| def backward(self, x, y): | |||
| loss = self.get_loss(x, y) | |||
| g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(loss, [self.w1, self.b1, self.w2, self.b2]) | |||
| return g_w1.data.flatten().tolist(), g_b1.data.flatten().tolist(), g_w2.data.flatten().tolist(), g_b2.data.flatten().tolist() | |||
| # Steps every parameter by -lr times its gradient; returns None (the return is commented out). | |||
| def update(self, x, y, lr): | |||
| # loss = self.get_loss(x, y) | |||
| # g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(loss, [self.w1, self.b1, self.w2, self.b2]) | |||
| loss = self.get_loss(x, y) | |||
| g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(loss, [self.w1, self.b1, self.w2, self.b2]) | |||
| self.w1.update(g_w1, -lr) | |||
| self.b1.update(g_b1, -lr) | |||
| self.w2.update(g_w2, -lr) | |||
| self.b2.update(g_b2, -lr) | |||
| # print(loss.data) | |||
| # return self.w1.data.flatten().tolist(), self.b1.data.flatten().tolist(), self.w2.data.flatten().tolist(), self.b2.data.flatten().tolist() | |||
| # Fits sin(x) on 200 unshuffled points of [-2pi, 2pi] using the module-level | |||
| # names epoch and batch_size, then prints a loss value. | |||
| def train(self): | |||
| self.x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1) | |||
| self.argsort_x = np.argsort(self.x.flatten()) | |||
| self.y = np.sin(self.x) | |||
| for i in range(epoch): | |||
| # np.random.RandomState(0).shuffle(self.x) | |||
| index = 0 | |||
| while index < self.x.shape[0]: | |||
| x = self.x[index:index + batch_size] | |||
| y = self.y[index:index + batch_size] | |||
| cx = stdnn.Constant(x) | |||
| cy = stdnn.Constant(y) | |||
| self.update(cx, cy, 0.01) | |||
| index += batch_size | |||
| # NOTE(review): indentation was lost in this copy, so it is unclear whether this | |||
| # break leaves the batch loop (one batch only) or the epoch loop -- confirm. | |||
| break | |||
| loss = self.get_loss(cx, cy) | |||
| print(loss.data) | |||
| # Script configuration: layer sizes plus the batch_size and epoch globals | |||
| # that both train() methods read. | |||
| input_features = 1 | |||
| hidden_features = 50 | |||
| output_features = 1 | |||
| batch_size = 10 | |||
| epoch = 1 | |||
| model = LinearTestModel(input_features, hidden_features, output_features) | |||
| smodel = StdLinerTestModel(input_features, hidden_features, output_features, model) | |||
| # Only the reference model is trained; the uctc run is currently disabled. | |||
| # model.train() | |||
| smodel.train() | |||
| @@ -0,0 +1,144 @@ | |||
| import uctc.nn as nn | |||
| import std_model as stdnn | |||
| import numpy as np | |||
| from data6 import x, y | |||
| np.random.seed(42) | |||
def parameter_data(*shape):
    """
    Draw a float32 matrix of the given 2-D shape with entries uniform in
    [-limit, limit], where limit = sqrt(3 / mean(shape)).

    Raises AssertionError unless exactly two positive ``int`` dimensions are
    given. Uses NumPy's global random state.
    """
    assert len(shape) == 2, (
        "Shape must have 2 dimensions, instead has {}".format(len(shape)))
    assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
        "Shape must consist of positive integers, got {!r}".format(shape))
    bound = np.sqrt(3.0 / np.mean(shape))
    samples = np.random.uniform(low=-bound, high=bound, size=shape)
    return samples.astype(np.float32)
class MNISTModel:
    """uctc three-layer classifier (784 -> 200 -> 100 -> 10) used for comparison runs."""

    def __init__(self):
        # Layer sizes and training hyperparameters.
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10
        self.lr = 0.01
        self.batch_size = 100

        # Initial values are kept as attributes so a reference model can be
        # started from them. Draw order is preserved.
        self.w1data = parameter_data(self.input_features, self.h1)
        self.b1data = parameter_data(1, self.h1)
        self.w2data = parameter_data(self.h1, self.h2)
        self.b2data = parameter_data(1, self.h2)
        self.w3data = parameter_data(self.h2, self.output_features)
        self.b3data = parameter_data(1, self.output_features)

        self.w1 = nn.Parameter(self.w1data)
        self.b1 = nn.Parameter(self.b1data)
        self.w2 = nn.Parameter(self.w2data)
        self.b2 = nn.Parameter(self.b2data)
        self.w3 = nn.Parameter(self.w3data)
        self.b3 = nn.Parameter(self.b3data)

    def run(self, x):
        """Return the output node of the network (no activation on the last layer)."""
        hidden_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        hidden_2 = nn.ReLU(nn.AddBias(nn.Linear(hidden_1, self.w2), self.b2))
        return nn.AddBias(nn.Linear(hidden_2, self.w3), self.b3)

    def get_loss(self, x, y):
        """Return the nn.SoftmaxLoss node between run(x) and y."""
        scores = self.run(x)
        return nn.SoftmaxLoss(scores, y)

    def train(self, x, y):
        """
        Apply one update with multiplier ``self.lr`` to all six parameters and
        return the gradient data for w1, b1, w2, b2, w3, b3 as a tuple.
        """
        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = nn.gradients(
            self.get_loss(x, y),
            [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
        pairs = (
            (self.w1, g_w1), (self.b1, g_b1),
            (self.w2, g_w2), (self.b2, g_b2),
            (self.w3, g_w3), (self.b3, g_b3),
        )
        for param, grad in pairs:
            param.update(grad, self.lr)
        return tuple(grad.data() for _, grad in pairs)
class StdMNISTModel:
    """Reference (std_model) twin of MNISTModel, started from the same initial values."""

    def __init__(self, model: MNISTModel):
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10
        self.lr = 0.01
        self.batch_size = 100

        # Each parameter gets its own copy of the initial values. Assigning
        # the arrays directly would alias them, and the in-place updates done
        # in train() would then overwrite the arrays stored on ``model``.
        self.w1 = stdnn.Parameter(self.input_features, self.h1)
        self.w1.data = model.w1data.copy()
        self.b1 = stdnn.Parameter(1, self.h1)
        self.b1.data = model.b1data.copy()
        self.w2 = stdnn.Parameter(self.h1, self.h2)
        self.w2.data = model.w2data.copy()
        self.b2 = stdnn.Parameter(1, self.h2)
        self.b2.data = model.b2data.copy()
        self.w3 = stdnn.Parameter(self.h2, self.output_features)
        self.w3.data = model.w3data.copy()
        self.b3 = stdnn.Parameter(1, self.output_features)
        self.b3.data = model.b3data.copy()

    def run(self, x):
        """Return the output node of the network (no activation on the last layer)."""
        l1 = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
        l2 = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(l1, self.w2), self.b2))
        l3 = stdnn.AddBias(stdnn.Linear(l2, self.w3), self.b3)
        return l3

    def get_loss(self, x, y):
        """Return the stdnn.SoftmaxLoss node between run(x) and y."""
        return stdnn.SoftmaxLoss(self.run(x), y)

    def train(self, x, y):
        """
        Step all six parameters by ``-self.lr`` times their gradients and
        return the gradients for w1, b1, w2, b2, w3, b3 as flat Python lists.
        """
        loss = self.get_loss(x, y)
        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = stdnn.gradients(loss, [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
        self.w1.update(g_w1, -self.lr)
        self.b1.update(g_b1, -self.lr)
        self.w2.update(g_w2, -self.lr)
        self.b2.update(g_b2, -self.lr)
        self.w3.update(g_w3, -self.lr)
        self.b3.update(g_b3, -self.lr)
        return g_w1.data.flatten().tolist(), g_b1.data.flatten().tolist(), g_w2.data.flatten().tolist(), g_b2.data.flatten().tolist(), g_w3.data.flatten().tolist(), g_b3.data.flatten().tolist()
def first_mismatch(actual, expected, tol=1e-4):
    """
    Return the index of the first element pair that differs by more than
    ``tol``, or None when all pairs agree.

    Both inputs are flattened before comparison. A NaN on either side counts
    as a mismatch, and a difference in element count raises AssertionError.
    """
    got = np.asarray(actual, dtype=np.float64).ravel()
    ref = np.asarray(expected, dtype=np.float64).ravel()
    if got.size != ref.size:
        raise AssertionError(f"Size mismatch: {got.size} values vs {ref.size} reference values")
    bad = ~(np.abs(got - ref) <= tol)
    return int(np.argmax(bad)) if bad.any() else None


model = MNISTModel()
smodel = StdMNISTModel(model)

# Forward pass through the uctc model.
o1_x = nn.Constant(x)
o1_y = nn.Constant(y)
o1_out = model.run(o1_x).data()
print(o1_out)
# o1_loss = model.get_loss(o1_x, o1_y)
# print(o1_loss.data()[0])
# o1_gw1, o1_gb1, o1_gw2, o1_gb2, o1_gw3, o1_gb3 = model.train(o1_x, o1_y)

# Forward pass through the reference model.
o2_x = stdnn.Constant(x)
o2_y = stdnn.Constant(y)
o2_out = smodel.run(o2_x).data
print(o2_out)
# o2_loss = smodel.get_loss(o2_x, o2_y)
# print(o2_loss.data)
# o2_gw1, o2_gb1, o2_gw2, o2_gb2, o2_gw3, o2_gb3 = smodel.train(o2_x, o2_y)

# for i, (a, b) in enumerate(zip(o1_gw1, o2_gw1)):
#     if abs(a - b) > 1e-4:
#         print(f"gw1 failed: {i, a, b}")
#         break
# for i, (a, b) in enumerate(zip(o1_gb1, o2_gb1)):
#     if abs(a - b) > 1e-4:
#         print(f"gb1 failed: {i, a, b}")
#         break
# for i, (a, b) in enumerate(zip(o1_gw2, o2_gw2)):
#     if abs(a - b) > 1e-4:
#         print(f"gw2 failed: {i, a, b}")
#         break
# for i, (a, b) in enumerate(zip(o1_gb2, o2_gb2)):
#     if abs(a - b) > 1e-4:
#         print(f"gb2 failed: {i, a, b}")
#         break
# for i, (a, b) in enumerate(zip(o1_gw3, o2_gw3)):
#     if abs(a - b) > 1e-4:
#         print(f"gw3 failed: {i, a, b}")
#         break
# for i, (a, b) in enumerate(zip(o1_gb3, o2_gb3)):
#     if abs(a - b) > 1e-4:
#         print(f"gb3 failed: {i, a, b}")
#         break
# print(o1_loss.data()[0], o2_loss.data)

# "PASSED" used to be printed unconditionally; it is now gated on the two
# forward outputs agreeing within the 1e-4 tolerance of the disabled checks.
bad_index = first_mismatch(o1_out, o2_out)
if bad_index is not None:
    reference_flat = np.asarray(o2_out).ravel()
    raise AssertionError(
        f"Forward output mismatch at position {bad_index}: "
        f"uctc {o1_out[bad_index]} vs reference {reference_flat[bad_index]}")
print("PASSED")
| @@ -0,0 +1,393 @@ | |||
| import numpy as np | |||
def format_shape(shape):
    """Render a shape as 'AxBx...'; an empty (falsy) shape renders as '()'."""
    if not shape:
        return "()"
    return "x".join(str(dim) for dim in shape)
class Node(object):
    """Base class for graph nodes; supplies a repr showing type, data shape and id."""

    def __repr__(self):
        kind = type(self).__name__
        shape_text = format_shape(self.data.shape)
        return "<{} shape={} at {}>".format(kind, shape_text, hex(id(self)))
class DataNode(Node):
    """
    DataNode is the parent class for Parameter and Constant nodes.

    You should not need to use this class directly.
    """

    def __init__(self, data):
        # Data nodes are graph leaves: they have no parents.
        self.parents = list()
        self.data = data

    def _forward(self, *inputs):
        """Return the stored data; any inputs are ignored."""
        return self.data

    @staticmethod
    def _backward(gradient, *inputs):
        """Return an empty list: there are no parents to receive gradients."""
        return list()
class Parameter(DataNode):
    """
    A Parameter node stores parameters used in a neural network (or perceptron).

    Use the `update` method to update parameters when training the
    perceptron or neural network.
    """

    def __init__(self, *shape):
        """Create a parameter of the given 2-D shape, uniform in +/- sqrt(3 / mean(shape))."""
        assert len(shape) == 2, (
            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            "Shape must consist of positive integers, got {!r}".format(shape))
        bound = np.sqrt(3.0 / np.mean(shape))
        initial = np.random.uniform(low=-bound, high=bound, size=shape)
        super().__init__(initial)

    def update(self, direction, multiplier):
        """
        Add ``multiplier * direction.data`` to this parameter in place.

        ``direction`` must be a Constant whose data has this parameter's shape
        and ``multiplier`` a Python int or float; the result must stay finite.
        """
        assert isinstance(direction, Constant), (
            "Update direction must be a {} node, instead has type {!r}".format(
                Constant.__name__, type(direction).__name__))
        assert direction.data.shape == self.data.shape, (
            "Update direction shape {} does not match parameter shape "
            "{}".format(
                format_shape(direction.data.shape),
                format_shape(self.data.shape)))
        assert isinstance(multiplier, (int, float)), (
            "Multiplier must be a Python scalar, instead has type {!r}".format(
                type(multiplier).__name__))
        step = multiplier * direction.data
        self.data += step
        assert np.all(np.isfinite(self.data)), (
            "Parameter contains NaN or infinity after update, cannot continue")
class Constant(DataNode):
    """
    A node wrapping a floating-point numpy array. Constants represent:
    * input features
    * output labels
    * gradients computed by back-propagation

    They are normally supplied by the dataset or returned by `nn.gradients`,
    so building one by hand should rarely be necessary.
    """

    def __init__(self, data):
        data_type_name = type(data).__name__
        assert isinstance(data, np.ndarray), (
            "Data should be a numpy array, instead has type {!r}".format(
                data_type_name))
        assert np.issubdtype(data.dtype, np.floating), (
            "Data should be a float array, instead has data type {!r}".format(
                data.dtype))
        super().__init__(data)
class FunctionNode(Node):
    """
    A node whose value is derived from other nodes.

    On construction it remembers its parents (needed later to compute
    gradients) and immediately evaluates `_forward` on the parents' data.
    """

    def __init__(self, *parents):
        assert all(isinstance(parent, Node) for parent in parents), (
            "Inputs must be node objects, instead got types {!r}".format(
                tuple(type(parent).__name__ for parent in parents)))
        self.parents = parents
        parent_values = [parent.data for parent in parents]
        self.data = self._forward(*parent_values)
class Add(FunctionNode):
    """
    Element-wise sum of two matrices of identical shape.

    Usage: nn.Add(x, y)
    Inputs:
        x: a Node of shape (batch_size x num_features)
        y: a Node whose shape equals that of x
    Output:
        a Node of shape (batch_size x num_features)
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        first, second = inputs[0], inputs[1]
        assert first.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                first.ndim))
        assert second.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                second.ndim))
        assert first.shape == second.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(first.shape), format_shape(second.shape)))
        return first + second

    @staticmethod
    def _backward(gradient, *inputs):
        # Both operands have derivative 1, so each parent receives the
        # incoming gradient unchanged.
        assert gradient.shape == inputs[0].shape
        return [gradient] * 2
class AddBias(FunctionNode):
    """
    Adds one bias row to every row of a feature matrix.

    Usage: nn.AddBias(features, bias)
    Inputs:
        features: a Node of shape (batch_size x num_features)
        bias: a Node of shape (1 x num_features)
    Output:
        a Node of shape (batch_size x num_features)
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, bias = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert bias.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                bias.ndim))
        assert bias.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape {}"
            .format(format_shape(bias.shape)))
        assert features.shape[1] == bias.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} and {}"
            .format(format_shape(features.shape), format_shape(bias.shape)))
        # Broadcasting repeats the single bias row across the batch.
        return features + bias

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # The bias was shared by every row, so its gradient is the column-wise
        # sum over the batch, kept as a (1 x num_features) row.
        bias_gradient = np.sum(gradient, axis=0, keepdims=True)
        return [gradient, bias_gradient]
class DotProduct(FunctionNode):
    """
    Dot product of every feature row with a single weight row.

    Usage: nn.DotProduct(features, weights)
    Inputs:
        features: a Node of shape (batch_size x num_features)
        weights: a Node of shape (1 x num_features)
    Output: a Node of shape (batch_size x 1)
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert weights.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape {}"
            .format(format_shape(weights.shape)))
        assert features.shape[1] == weights.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} and {}"
            .format(format_shape(features.shape), format_shape(weights.shape)))
        # (batch x n) . (n x 1) -> (batch x 1)
        return np.dot(features, weights.T)

    @staticmethod
    def _backward(gradient, *inputs):
        # Deliberately disabled. The implementation left for reference was:
        #   assert gradient.shape[0] == inputs[0].shape[0]
        #   assert gradient.shape[1] == 1
        #   return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
        message = (
            "Backpropagation through DotProduct nodes is not needed in this "
            "assignment")
        raise NotImplementedError(message)
class Linear(FunctionNode):
    """
    Applies a linear transformation (matrix multiplication) to the input.

    Usage: nn.Linear(features, weights)
    Inputs:
        features: a Node with shape (batch_size x input_features)
        weights: a Node with shape (input_features x output_features)
    Output: a Node with shape (batch_size x output_features)
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert features.shape[1] == weights.shape[0], (
            "Second dimension of first input should match first dimension of "
            "second input, instead got shapes {} and {}".format(
                format_shape(features.shape), format_shape(weights.shape)))
        # (batch x in) . (in x out) -> (batch x out)
        return np.dot(features, weights)

    @staticmethod
    def _backward(gradient, *inputs):
        features, weights = inputs[0], inputs[1]
        # The incoming gradient has the shape of the output: (batch x out).
        assert gradient.shape[0] == features.shape[0]
        assert gradient.shape[1] == weights.shape[1]
        # d(loss)/d(features) = gradient . weights^T   -> (batch x in)
        # d(loss)/d(weights)  = features^T . gradient  -> (in x out)
        return [np.dot(gradient, weights.T), np.dot(features.T, gradient)]
class ReLU(FunctionNode):
    """
    Element-wise Rectified Linear Unit: max(x, 0).

    Every negative entry of the input is replaced by zero.

    Usage: nn.ReLU(x)
    Input:
        x: a Node of shape (batch_size x num_features)
    Output: a Node shaped like x that contains no negative entries
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
        values = inputs[0]
        assert values.ndim == 2, (
            "Input should have 2 dimensions, instead has {}".format(
                values.ndim))
        return np.maximum(values, 0)

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # The gradient passes through only where the input was strictly positive.
        mask = np.where(inputs[0] > 0, 1.0, 0.0)
        return [gradient * mask]
class SquareLoss(FunctionNode):
    """
    Mean of 0.5 * (a[i,j] - b[i,j])**2 taken over every position (i,j) of
    the two (batch_size x dim) inputs.

    Usage: nn.SquareLoss(a, b)
    Inputs:
        a: a Node of shape (batch_size x dim)
        b: a Node of shape (batch_size x dim)
    Output: a scalar Node (a single floating-point number)
    """

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        a, b = inputs[0], inputs[1]
        assert a.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                a.ndim))
        assert b.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                b.ndim))
        assert a.shape == b.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(a.shape), format_shape(b.shape)))
        half_squares = np.square(a - b) / 2
        return np.mean(half_squares)

    @staticmethod
    def _backward(gradient, *inputs):
        # The loss is a scalar, so the incoming gradient must be 0-dimensional.
        assert np.asarray(gradient).ndim == 0
        a, b = inputs[0], inputs[1]
        count = a.size  # the mean in _forward divides by this
        grad_a = gradient * (a - b) / count
        grad_b = gradient * (b - a) / count
        return [grad_a, grad_b]
class SoftmaxLoss(FunctionNode):
    """
    Batched softmax (cross-entropy) loss for classification.

    IMPORTANT: the order of the two inputs matters, do not swap them!

    Usage: nn.SoftmaxLoss(logits, labels)
    Inputs:
        logits: a Node of shape (batch_size x num_classes). Row i holds the
            scores of example i for each class; scores may be any real number.
        labels: a Node of shape (batch_size x num_classes) encoding the
            correct labels. Entries must be non-negative and every row must
            sum to 1.
    Output: a scalar Node (a single floating-point number)
    """

    @staticmethod
    def log_softmax(logits):
        # Subtract the row maximum before exponentiating to avoid overflow.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        row_log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        shifted -= row_log_norm
        return shifted

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        logits, labels = inputs[0], inputs[1]
        assert logits.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                logits.ndim))
        assert labels.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                labels.ndim))
        assert logits.shape == labels.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(logits.shape), format_shape(labels.shape)))
        assert np.all(labels >= 0), (
            "All entries in the labels input must be non-negative")
        assert np.allclose(np.sum(labels, axis=1), 1), (
            "Labels input must sum to 1 along each row")
        log_probs = SoftmaxLoss.log_softmax(logits)
        # Cross-entropy per example, averaged over the batch.
        per_example = -np.sum(labels * log_probs, axis=1)
        return np.mean(per_example)

    @staticmethod
    def _backward(gradient, *inputs):
        # The loss is a scalar, so the incoming gradient must be 0-dimensional.
        assert np.asarray(gradient).ndim == 0
        logits, labels = inputs[0], inputs[1]
        log_probs = SoftmaxLoss.log_softmax(logits)
        batch_size = logits.shape[0]
        grad_logits = gradient * (np.exp(log_probs) - labels) / batch_size
        grad_labels = gradient * -log_probs / batch_size
        return [grad_logits, grad_labels]
def gradients(loss, parameters):
    """
    Back-propagates from `loss` and returns the gradient of the loss with
    respect to each of the provided parameters.

    Usage: nn.gradients(loss, parameters)
    Inputs:
        loss: a SquareLoss or SoftmaxLoss node. A loss node can be
            differentiated only once; it is marked with a `used` attribute.
        parameters: a list or any other iterable (one-shot generators
            included) of Parameter nodes
    Output: a list of Constant objects, one per parameter and in the same
        order, holding the gradient of the loss with respect to that
        parameter. A parameter the loss does not depend on gets zeros.
    """
    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
        "Loss must be a loss node, instead has type {!r}".format(
            type(loss).__name__))
    # `parameters` is walked several times below (validation, seeding the
    # gradient table, building the result). Materialize it once so that a
    # one-shot iterable is not exhausted by the first pass, which would
    # otherwise make this function silently return an empty list.
    parameters = list(parameters)
    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
        "Parameters must all have type {}, instead got types {!r}".format(
            Parameter.__name__,
            tuple(type(parameter).__name__ for parameter in parameters)))
    assert not hasattr(loss, "used"), (
        "Loss node has already been used for backpropagation, cannot reuse")
    loss.used = True

    nodes = set()
    tape = []

    def visit(node):
        # Post-order walk: a node is recorded only after all of its parents,
        # so `tape` lists every node after the nodes it depends on.
        if node not in nodes:
            for parent in node.parents:
                visit(parent)
            nodes.add(node)
            tape.append(node)

    visit(loss)
    # Parameters unreachable from the loss still need a (zero) gradient entry.
    nodes |= set(parameters)

    grads = {node: np.zeros_like(node.data) for node in nodes}
    grads[loss] = 1.0  # d(loss)/d(loss)

    # Walk the tape backwards so a node's gradient is complete before it is
    # pushed on to its parents.
    for node in reversed(tape):
        parent_grads = node._backward(
            grads[node], *(parent.data for parent in node.parents))
        for parent, parent_grad in zip(node.parents, parent_grads):
            grads[parent] += parent_grad

    return [Constant(grads[parameter]) for parameter in parameters]
def as_scalar(node):
    """
    Returns the value of a Node as a standard Python number. This only works
    for nodes with exactly one element (e.g. SquareLoss and SoftmaxLoss, as
    well as DotProduct with a batch size of 1 element).

    The node itself is left untouched: its `data` keeps its original shape.
    """
    assert isinstance(node, Node), (
        "Input must be a node object, instead has type {!r}".format(
            type(node).__name__))
    assert node.data.size == 1, (
        "Node has shape {}, cannot convert to a scalar".format(
            format_shape(node.data.shape)))
    # `.item()` converts a one-element array (or numpy scalar) to a Python
    # number without reassigning `node.data`. The previous implementation
    # stored a flattened copy back on the node, which changed its shape as a
    # side effect of what should be a read-only query.
    return node.data.item()
| @@ -0,0 +1,45 @@ | |||
| import numpy as np | |||
| import uctc.nn as nn | |||
# Seed NumPy's global random generator with a fixed value at import time.
np.random.seed(42)
def parameter_data(*shape):
    """
    Build a randomly initialised float32 matrix of the given 2-D shape.

    Values are drawn uniformly from [-limit, limit], where
    limit = sqrt(3 / mean(shape)).
    """
    rank = len(shape)
    assert rank == 2, (
        "Shape must have 2 dimensions, instead has {}".format(rank))
    assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
        "Shape must consist of positive integers, got {!r}".format(shape))
    bound = np.sqrt(3.0 / np.mean(shape))
    samples = np.random.uniform(low=-bound, high=bound, size=shape)
    return samples.astype(np.float32)
class Dataset(object):
    """
    Pairs a 2-D float feature array `x` with a 2-D float label array `y`
    (same number of rows) and serves them in fixed-size batches.
    """

    def __init__(self, x, y):
        # Both inputs must be floating-point numpy matrices ...
        assert isinstance(x, np.ndarray)
        assert isinstance(y, np.ndarray)
        assert np.issubdtype(x.dtype, np.floating)
        assert np.issubdtype(y.dtype, np.floating)
        assert x.ndim == 2
        assert y.ndim == 2
        # ... with one label row per feature row.
        assert x.shape[0] == y.shape[0]
        self.x, self.y = x, y

    def iterate_once(self, batch_size):
        """Yield (features, labels) Constant pairs covering the data once, in order."""
        assert isinstance(batch_size, int) and batch_size > 0, (
            f"Batch size should be a positive integer, got {batch_size}")
        assert self.x.shape[0] % batch_size == 0, (
            f"Dataset size {self.x.shape[0]} is not divisible by batch size {batch_size}")
        start = 0
        while start < self.x.shape[0]:
            stop = start + batch_size
            features = self.x[start:stop]
            labels = self.y[start:stop]
            yield nn.Constant(features), nn.Constant(labels)
            start = stop

    def iterate_forever(self, batch_size):
        """Repeat `iterate_once` endlessly; the caller decides when to stop."""
        while True:
            yield from self.iterate_once(batch_size)

    def get_validation_accuracy(self):
        """Always raises: this dataset has no validation split."""
        raise NotImplementedError(
            "No validation data is available for this dataset. "
            "In this assignment, only the Digit Classification and Language "
            "Identification datasets have validation data.")
| @@ -0,0 +1,36 @@ | |||
| ### Welcome to uct lab | |||
| > uct 是Undergraduate Computing Torch的简写。 | |||
| 欢迎你选择uct作为自己的大实验,在这个大实验中,我们将亲自动手使用C++搭建一个机器学习框架,并完成手写体数据集MNIST的识别。 | |||
| 注意:你不需要获得任何对于神经网络的前置知识,考虑到《大学计算(下)》面向的是本科一年级学生,我们设计了非常详细的实验指导书帮助你完成这个实验。 | |||
| #### 安装构建工具 | |||
| 大型的C++项目显然不止是几个文件,而是成百上千个文件,因此我们需要一个工具来管理这些文件。有很多课程会使用到类似的工具(在《操作系统》课程上,你将会遇见Makefile;在《编译原理》、《并行编译与优化》上,你将会用到CMake),在这里我们选择CMake。 | |||
| > CMake 是一个开源的跨平台构建系统生成工具,广泛用于管理软件构建过程。它通过生成标准的构建文件(如 Makefile、Visual Studio 项目文件等)来简化跨平台项目的构建流程。 | |||
| > 对于经验丰富的同学,如果你喜欢使用别的构建工具(例如Bazel)也是可以的~ | |||
| 假如你也正在使用WSL(2),运行下面的命令可以安装好所需要的工具和库 | |||
| ```bash | |||
| sudo apt update | |||
| sudo apt install -y build-essential cmake git gcc g++ | |||
| ``` | |||
| #### 准备Python环境 | |||
| 首先,你需要在Linux下具备Python环境。相信在《大学计算(上)》中,你已经具备这样的技能。我们以使用WSL+VSCode为例介绍环境配置的具体方案。 | |||
| 在VSCode中连接WSL,打开对应目录。 | |||
| 使用`conda`创建一个环境(或使用已有环境),然后执行 | |||
| ``` | |||
| pip install pybind11 | |||
| ``` | |||
| 而后,通过`pip show pybind11`可以找到`pybind11`的安装路径,将对应的头文件路径添加到`.vscode/c_cpp_properties.json`的`includePath`中。 | |||
| @@ -0,0 +1,117 @@ | |||
| ### 第一部分:基本操作 | |||
| #### 基本函数的构建 | |||
| 在这一部分中,我们将完成基本的四则运算和由它们组合而成的初等函数的构建。你需要在cc/operators中补全`ops.h`和`ops.cc`的内容。 | |||
| **[TASK 1]** 在`ops.h`中,你需要补全以下函数的实现: | |||
| - `mul`函数,输入为两个数`a`、`b`,输出为它们的乘积。 | |||
| - `id`函数,将输入原样输出。 | |||
| - `add`函数,输入为两个数`a`、`b`,输出为它们的和。 | |||
| - `neg`函数,输入为`a`,输出为`-a`。 | |||
| - `lt`函数,输入为两个数`a`、`b`,输出为`(float)(a < b)`。 | |||
| - `eq`函数,输入为两个数`a`、`b`,输出为`(float)(a == b)`。 | |||
| - `max`函数,输入为两个数`a`、`b`,输出为`a`和`b`中较大的那个。 | |||
| 它们都是模板函数,相信你已经注意到了,它们都被定义在`.h`文件中,而不是`.cc`文件中,这与C++的模板的实例化机制和编译模型有关。 | |||
| 模板的实例化机制:模板函数或模板类并不是真正的代码,而是一个“蓝图”或“模式”,编译器在编译时根据这个蓝图生成具体的代码。这个过程称为模板实例化。例如,当你使用一个模板函数时,编译器会根据你传递的类型参数生成一个具体的函数版本。这个生成的过程发生在编译时。 | |||
| 编译模型:C++采用的是分离编译模型,即每个源文件(.cc 或 .cpp 文件)是独立编译的。编译器在编译一个源文件时,只会看到该源文件及其包含的头文件中的内容。如果你将模板函数的定义放在源文件中,其他源文件在编译时无法看到模板的定义,因此无法生成对应的实例化代码。 | |||
| 另外,你应当还注意到了我们为这两个文件提供了名叫`operators`的命名空间(namespace)。主要是为了防止不同命名空间中的重名冲突。 | |||
| **[TASK 2]** 在`ops.cc`中,你需要完成以下函数的实现: | |||
| - `is_close`函数,输入为两个数`x`、`y`,输出为`(float)(abs(x - y) < epsilon)`。 | |||
| - `sigmoid`函数,输入为`x`,为了方便计算,在输出时遵照下面的规则: | |||
| $$ | |||
| f(x) =\left\{\begin{matrix} | |||
| \frac{1.0}{(1.0 + e^{-x})}, x\ge 0 | |||
| \\ | |||
| \frac{e^x}{(1.0 + e^{x})}, \mathrm{otherwise} | |||
| \end{matrix}\right. | |||
| $$ | |||
| - `relu`函数,输入为`x`,输出为`x > 0.0 ? x : 0.0`。 | |||
| - `inv`函数,输入为`x`,输出为`1.0 / x`。 | |||
| - `inv_back`函数,用于计算$f(x)=\frac{1}{x}$的反向传播,即导数$f'(x)=-\frac{1}{x^2}$乘以上游梯度`d`;输入为`x`和`d`,输出为$-\frac{d}{x^2}$。 | |||
| - `relu_back`函数,输入为`x`和`d`,输出为`x > 0.0 ? d*1.0 : 0.0`。 | |||
| #### 函数式编程基础 | |||
| 实现`map`、`zipWith`和`reduce`。 | |||
| `map`接受一个`std::vector`和一个函数作为输入,返回一个新的`std::vector`,其中每个元素都是输入函数应用于输入`std::vector`中对应元素的结果。具体来说,对于下面这个实现: | |||
| ```cpp | |||
| template<typename T, typename F> | |||
| auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> { | |||
| std::vector<decltype(func(std::declval<T>()))> result; | |||
| result.reserve(vec.size()); | |||
| std::transform(vec.begin(), vec.end(), std::back_inserter(result), func); | |||
| return result; | |||
| } | |||
| ``` | |||
| 有几处可能让你感到疑惑的地方。 | |||
| 首先,这里的函数返回值居然和Python一样被后置了!`->` 是 C++11 引入的尾置返回类型语法。它的作用是将函数的返回类型放在函数参数列表之后,而不是放在函数名之前。在某些情况下,返回类型可能依赖于函数参数或模板参数,而这些信息在函数名之前是不可用的。尾置返回类型允许我们在函数参数列表之后推导返回类型。 | |||
| > 例如,在`map`函数中,返回类型依赖于`func`的返回类型,而`func`的类型在函数名之前是未知的。使用尾置返回类型可以解决这个问题。 | |||
| 其次,我们使用了`std::declval`。`std::declval`是 C++11 引入的一个工具,用于在编译时模拟一个对象的“假实例”,以便在不实际构造对象的情况下推导类型。 | |||
| ```cpp | |||
| decltype(func(std::declval<T>())) | |||
| ``` | |||
| > 在`map`函数中,我们需要推导`func`的返回类型。假设`func`是一个函数对象,接受`T`类型的参数并返回某种类型`R`,我们可以使用`std::declval`来模拟调用`func`的过程。 | |||
| **[TASK 3]** 在`ops.cc`中,调用我们给出的`map`函数实现和你刚刚完成的`neg`函数,补全`negList`函数(大约需要1行代码)。 | |||
| **[TASK 4]** 在`ops.h`中,仿照`map`函数,补全`zipWith`函数(大约需要10行代码)。`zipWith`函数接受两个`vector`和一个函数`func`作为输入,要得到一个新的`vector`,这个`vector`中的元素都是两个`vector`逐元素进行函数`func`操作之后的结果。例如,对于`vec1 = [1, 2, 3]`,`vec2 = [5, 6, 7]`,`func`为`add`,那么将返回`[6, 8, 10]`。注意:在进行`zipWith`函数的实现时,你需要考虑输入的两个`std::vector`长度不一致的情况,对于这种情况,你简单地`throw`一个异常即可。 | |||
| **[TASK 5]** 在`ops.cc`中,使用你实现的`zipWith`和`add`函数,实现`addLists`函数(大约需要1行代码)。 | |||
| **[TASK 6]** 实际上你会发现`std::accumulate`(问一问LLM这个是个啥)就能够承担`reduce`函数的功能,因此你可以直接使用`std::accumulate`来实现`reduce`函数。这个任务需要你使用`reduce`函数实现`sumList`(将一个列表中的元素相加)和`prodList`(将一个列表中的元素相乘)函数(大约分别需要1行代码)。 | |||
| #### 检查结果 | |||
| 做完了?很好,切换到`cc`,执行下面的语句来编译框架 | |||
| ``` | |||
| cmake -S . -B build | |||
| cd build | |||
| make | |||
| ``` | |||
| 现在,编辑系统环境变量 | |||
| ``` | |||
| echo 'export PYTHONPATH="??????"' >> ~/.bashrc | |||
| ``` | |||
| 将`??????`替换为刚刚生成的`build`文件夹的绝对路径,这个路径应该形如 | |||
| ```Python | |||
| /home/hexu/learn/uc-modern-cpp-student/cc/build | |||
| ``` | |||
| > 可以切换到`build`目录下,执行`pwd`命令来获取绝对路径。 | |||
| 好了,不出意外的话,就再也别动`~/.bashrc`了。现在还有一个`frontend/framework/basis/test_task1.py`文件。切换到目录`frontend/framework/basis/`,直接运行task1到task6的文件,如果没有任何报错,说明你已经完成了这一关!🎉 | |||
| @@ -0,0 +1,56 @@ | |||
| ### 第二部分:自动微分 | |||
| #### 数值微分 | |||
| 有时候,我们无需知道一个函数具体的表达式,借助导数的定义,利用计算机可以求解出在某一点的导数值。这种方法称为数值微分。举个例子,对于任何一个$f(x)$,我们当然可以根据定义求出其在$x=x_0$处的导数,即 | |||
| $$f'(x)|_{x=x_0} \approx \frac{f(x_0+\varepsilon)-f(x_0 - \varepsilon)}{2\varepsilon }$$ | |||
| 其中$\varepsilon$是一个很小的正数。但是,如果$f(x)$的表达式非常复杂,那么我们可能无法直接求出导数。此时,我们可以借助数值微分来求解导数值。下面我们以$f(x)=x^2$为例,演示如何使用数值微分求解导数值。 | |||
| ```python | |||
| import numpy as np | |||
| def f(x): | |||
| return x**2 | |||
| def numerical_diff(f, x): | |||
| h = 1e-4 | |||
| return (f(x+h) - f(x-h)) / (2*h) | |||
| x = 5.0 | |||
| print(numerical_diff(f, x))  # 约等于 10.0,即 f'(5) = 2 * 5 | |||
| ``` | |||
| 当然,你现在需要用C++来完成这件事。 | |||
| **[TASK 7]** 补全`operators/autodiff.h`中的`central_difference`函数,实现数值微分,求出$f(x_1, x_2, ..., x_n)$在第$arg$个参数处的导数值。 | |||
| #### 高等数学中的导数 | |||
| 还记得$z = x + y$,对$x$和$y$分别求导的结果是什么吗?显然,根据多元函数的求导法则,有$\frac{\partial z}{\partial x}=1$,以及$\frac{\partial z}{\partial y}=1$。如果我们再考虑梯度,那么$z$的梯度就是$\nabla z = (1, 1)$。那么,对于更复杂的函数,比如$f(x, y) = x^2 + y^2$,其梯度$\nabla f$又是什么呢? | |||
| **[TASK 8]** 补全`operators/autodiff.h`中的`Add`类,能够对表达式$z = x + y$求导。 | |||
| 提示:补全`forward`和`backward`函数,分别实现前向传播和反向传播。前向传播:得到`a + b`的值;反向传播:得到`a`和`b`的梯度(也就是结果分别对`a`、`b`的偏导数,再乘上上游传来的梯度`d_input`)。 | |||
| **[TASK 9]** 仿照`Add`类构造`operators/autodiff.h`中的`Mul`类,能够对表达式$z = x \cdot y$求导。 | |||
| **[TASK 10]** 仿照`Add`类构造`operators/autodiff.h`中的`Log`类,能够对表达式$z = log(x)$求导。提示:使用`<cmath>`提供的`logf`函数。 | |||
| **[TASK 11]** 仿照`Add`类构造`operators/autodiff.h`中的`Inv`类,能够对表达式$z = 1 / x$求导。 | |||
| **[TASK 12]** 仿照`Add`类构造`operators/autodiff.h`中的`Sigmoid`类,能够对表达式$z = sigmoid(x)$求导。提示:使用`<cmath>`提供的`expf`函数。 | |||
| #### 检查结果 | |||
| 做完了?很好,切换到`cc`,执行下面的语句来编译框架 | |||
| ``` | |||
| cmake -S . -B build | |||
| cd build | |||
| make | |||
| ``` | |||
| 如果你已经完成了01,那么环境变量应该是好的。否则,请回到01的实验手册,查看如何修改环境变量。 | |||
| 现在还有一个`frontend/framework/autodiff/test_task7.py`文件。切换到目录`frontend/framework/autodiff/`,直接运行相应的task文件,如果没有任何报错,说明你已经完成了这一关!🎉 | |||
| @@ -0,0 +1,125 @@ | |||
| ### 第三部分:进入人工智能的世界 | |||
| > 前两关是不是很简单? | |||
| 相信你在前两部分中,已经积累了足够多的C++知识,也回忆起了足够多的高等数学知识。现在,我们要构造一个框架,这个框架可以接受一个矩阵作为输入,并且支持神经网络中的常见的网络层,例如 | |||
| - 线性层(Linear) | |||
| - 激活层(Activation) | |||
| - 损失层(Loss) | |||
| #### 张量类 | |||
| 我们已经在`cc/tensor/tensor.h`中定义了张量类,这个类可以表示一个多维数组,并且支持常见的数学运算。我们可以在`cc/tensor/tensor.cc`中实现这些运算。当然,我们假定所有的张量都是二维的,这样你就不必考虑各种情况。 | |||
| **[TASK 13]** 补全`cc/tensor/tensor.cc`中关于`Tensor::transpose()`的函数实现。它能够将一个张量进行转置。 | |||
| **[TASK 14]** 补全`cc/tensor/tensor.cc`中关于`argmax(const std::shared_ptr<Tensor>& tensor, int axis)`的函数实现,它能够返回一个张量在指定维度上的最大值的索引。提示:你可以使用`std::numeric_limits<float>::infinity()`,可以通过LLM来查询它的含义。 | |||
| > 前面做了这么多次测试,你是不是该自己学会写测试了?...算了,还是我来帮你写吧...😂 | |||
| 测试文件:`frontend/framework/tensor/task13_14.py` | |||
| **关于测试用例** 之后的内容的测试用例可以参考`frontend/uct/test`下的文件,或依据自己的需要编写。 | |||
| #### 线性层 | |||
| 线性层是神经网络中最为常见的网络层,它接受一个输入张量,并且输出一个张量。输入两个张量`feature: (batch_size x input_features)`和`weight: (input_features x output_features)`,输出张量`output: (batch_size x output_features)`,实际上就是将`feature`矩阵和`weight`矩阵相乘。 | |||
| 用公式表示就是$y = xW$,其中$x$是`feature`,$W$是`weight`(这里暂时不包含偏置项$b$)。 | |||
| **[TASK 15]** 补全`cc/operators/nn.h`中`Linear`类的构造函数和`forward`函数。 | |||
| - 构造函数:构造函数接受两个参数`a`和`b`,它们都是`std::shared_ptr<Node>`类型的智能指针,分别表示输入特征和权重。构造函数调用基类`FunctionNode`的构造函数,并将`a`和`b`传递给它。在构造函数中,调用`this->forward()`方法,并将结果赋值给`this->data`。 | |||
| - `forward`函数:参见有关线性层的介绍。 | |||
| **[TASK 16]** 补全`cc/operators/nn.cc`中`Linear`类的`backward`函数。 | |||
| - `backward()`函数实现反向传播,计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入,你需要计算`grad_features`和`grad_weights`,它们分别表示对`features`和`weights`的梯度。 | |||
| > 数学Tips:`grad_features`是通过将`gradient`与`weights`的转置相乘得到的。`grad_weights`是通过将`features`的转置与`gradient`相乘得到的。 | |||
| 完成了这两个任务后,你应该可以在`cc/`下执行 | |||
| ``` | |||
| cmake -S . -B build | |||
| cmake --build build | |||
| ``` | |||
| 就能够编译你的代码。然后,你应当可以运行`frontend/uct/perception.py`,它将使用你实现的线性层来训练一个感知机。 | |||
| #### 激活层 | |||
| 激活层是神经网络中常见的网络层,它接受一个输入张量,并且输出一个张量。输入一个张量`x`,输出一个张量`y`,实际上就是将`x`中的每个元素进行某种变换。 | |||
| 用公式表示就是$y = f(x)$。对于`ReLU`函数来说,$y = max(0, x)$。 | |||
| **[TASK 17]** 补全`cc/operators/nn.h`中`ReLU`类的构造函数和`forward`函数。 | |||
| - 构造函数:构造函数接受一个参数`a`,它是一个`std::shared_ptr<Node>`类型的智能指针,表示输入特征。构造函数调用基类`FunctionNode`的构造函数,并将`a`传递给它。在构造函数中,调用`this->forward()`方法,并将结果赋值给`this->data`。 | |||
| - `forward`函数:参见有关激活层的介绍。 | |||
| **[TASK 18]** 补全`cc/operators/nn.cc`中`ReLU`类的`backward`函数。 | |||
| - `backward()`函数实现反向传播,计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入,你需要计算`grads`,它表示对`features`的梯度。 | |||
| > 数学Tips:`grads`是将`gradient`逐元素乘以一个掩码得到的:`x`中大于0的位置掩码为1,其余位置掩码为0。 | |||
| #### 偏置 | |||
| 线性层中,我们没有实现偏置项`b`,它是一个向量,它的维度与输出特征的维度相同。偏置项的作用是使得线性层的输出能够更好地拟合数据。 | |||
| **[TASK 19]** 补全`cc/operators/nn.h`中`AddBias`类的构造函数和`forward`函数。 | |||
| - 构造函数:构造函数接受两个参数`a`和`b`,它们都是`std::shared_ptr<Node>`类型的智能指针,分别表示输入特征和偏置。构造函数调用基类`FunctionNode`的构造函数,并将`a`和`b`传递给它。在构造函数中,调用`this->forward()`方法,并将结果赋值给 `this->data`。 | |||
| - `forward`函数:`forward`方法实现前向传播,将偏置添加到输入特征上。`features`和`bias`分别从`this->objects`中获取,`features`的形状为`(batch_size x num_features)`,`bias`的形状为`(1 x num_features)`。在函数中,需要创建一个与`features`形状相同的输出张量`outNode`,使用嵌套循环将`features`的每个元素与`bias`的对应元素相加,结果存储在`outNode`中。最后,返回`outNode`。 | |||
| **[TASK 20]** 补全`cc/operators/nn.cc`中`AddBias`类的`backward`函数。 | |||
| - `backward()`函数实现反向传播,计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入,你需要计算`grad_features`和`grad_bias`,它们分别表示对`features`和`bias`的梯度。 | |||
| > 数学Tips:`grad_features`就是`gradient`的拷贝。而`bias`被加到了`batch_size`个样本的每一行上,因此在计算`bias`的梯度时,需要将`gradient`按列求和(把每一列中所有行的元素相加),得到形状为`(1 x num_features)`的`grad_bias`。 | |||
| #### 损失层——均方误差损失函数 | |||
| 我们首先实现均方误差损失函数,它接受两个张量`y_pred`和`y_true`,它们分别表示预测值和真实值,输出一个标量,表示预测值与真实值之间的误差。 | |||
| 用公式表示就是$\displaystyle loss = \frac{1}{n} \sum_{i=1}^{n} \frac{1}{2} (y_{pred,i} - y_{true,i})^2$,其中$n$是张量中元素的总个数。 | |||
| **[TASK 21]** 补全`cc/operators/nn.h`中`SquareLoss`类的构造函数和`forward`函数。 | |||
| - 构造函数:构造函数接受两个参数`a`和`b`,它们都是`std::shared_ptr<Node>`类型的智能指针,分别表示预测值和真实值。构造函数调用基类`FunctionNode`的构造函数,并将`a`和`b`传递给它。在构造函数中,调用`this->forward()`方法,并将结果赋值给`this->data`。 | |||
| - `forward`函数用于计算损失。 | |||
| **[TASK 22]** 补全`cc/operators/nn.cc`中`SquareLoss`类的`backward`函数。 | |||
| - `backward`函数计算损失函数相对于输入`a`和`b`的梯度。`gradient`是损失函数对输出的梯度(是一个形状为(1, 1)的张量,可以直接认为其是一个标量`g`)。`grad_a`和`grad_b`分别存储`a`和`b`的梯度。对于每个元素,`grad_a`计算为`g * (a->data->data[i] - b->data->data[i]) / a->data->size`,`grad_b`则是它的相反数,即`g * (b->data->data[i] - a->data->data[i]) / a->data->size`。最终返回由 grad_a 和 grad_b 组成的向量。 | |||
| #### 损失层——SoftmaxLoss | |||
| 接下来,我们实现Softmax损失函数,它接受两个张量`y_pred`和`y_true`,它们分别表示预测值和真实值,输出一个标量,表示预测值与真实值之间的误差。 | |||
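| 用公式表示就是$\displaystyle loss = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{C} y_{true}^{(i,j)} \log\left(\mathrm{softmax}(y_{pred})^{(i,j)}\right)$,其中$N$是`batch_size`,$C$是类别数,$\mathrm{softmax}$对每一行分别计算。 | |||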
| **[TASK 23]** 补全`cc/operators/nn.h`中`SoftmaxLoss`类的构造函数,`forward`函数和`backward`函数。 | |||
| 完成上述内容后,你可以编译和运行`frontend/uct/regression.py`,使用线性网络来拟合`sin`函数。 | |||
| ### 手写体识别 | |||
| 补全代码中的其他标注有`TODO`的内容,最后编译运行,你就将能够训练一个手写体识别模型。可以运行`frontend/uct/mnist.py`来试一下吧! | |||
| > 是不是觉得运行得有点慢?考虑使用多线程来加速矩阵运算。(这已经超出了这门课的要求,对高性能计算/并行计算感兴趣的同学可以勇于尝试!) | |||
| ### extra bonus | |||
| 想打副本? | |||
| ``` | |||
| nslookup -type=txt uc-cpp.shahe.org | |||
| ``` | |||