finish task design

1 year ago · 481bd3b4c2
--- a/cc/CMakeLists.txt
+++ b/cc/CMakeLists.txt
@@ -0,0 +1,15 @@
 cmake_minimum_required(VERSION 3.19)
 project(uctc)

 # Build as C++17 and fail at configure time if the compiler cannot provide it.
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 # Location of pybind11's CMake package. It is a cache entry so that other
 # machines can override it with -DPYBIND11_DIR=<path>; the default keeps the
 # original author's environment working unchanged.
 set(PYBIND11_DIR /home/hexu/miniconda3/lib/python3.11/site-packages/pybind11 CACHE PATH "Directory containing the pybind11 CMake package")
 set(PYBIND11_FINDPYTHON ON)
 find_package(pybind11 CONFIG REQUIRED PATHS ${PYBIND11_DIR})
 pybind11_add_module(uctc uctc.cc math/arith.cc operators/nn.cc operators/ops.cc tensor/tensor.cc)

 # Optimise this target without overwriting flags supplied by the user or the
 # toolchain (the previous set(CMAKE_CXX_FLAGS -O3) discarded them).
 target_compile_options(uctc PRIVATE -O3)

 # After every build, generate Python type stubs for the module inside the
 # build directory so editors and type checkers can see its API.
 add_custom_command(
    TARGET uctc POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E echo "Changing directory and running Python script for generate interpreter annotations"
    COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR} pybind11-stubgen uctc --output-dir .
    VERBATIM
 )
--- a/cc/math/arith.cc
+++ b/cc/math/arith.cc
@@ -0,0 +1,14 @@
 #include "arith.h"

 namespace arith {

 // Single-precision square root; thin wrapper over the C math library's sqrtf.
 float sqrt(float x) {
    const float root = sqrtf(x);
    return root;
 }

 // Arithmetic mean of a list of integers.
 //
 // The sum is accumulated and divided in floating point. The previous version
 // divided an int by an unsigned size, which truncated the result, produced
 // garbage for negative sums and divided by zero for an empty list.
 //
 // Returns 0.0f for an empty input.
 float mean(const std::vector<int>& x) {
    if (x.empty()) {
        return 0.0f;
    }
    const double total = std::accumulate(x.begin(), x.end(), 0.0);
    return static_cast<float>(total / static_cast<double>(x.size()));
 }


 }
--- a/cc/math/arith.h
+++ b/cc/math/arith.h
@@ -0,0 +1,20 @@
 #pragma once
 #include <cmath>
 #include <vector>
 #include <numeric>

 namespace arith {

 float sqrt(float x);
 float mean(const std::vector<int>& x);

 // Dense row-major matrix multiplication: c = a * b.
 //
 //   a : m x k matrix stored row-major in a flat vector
 //   b : k x n matrix stored row-major in a flat vector
 //   c : receives the m x n product (resized to m * n elements)
 //
 // The product is built in a scratch buffer and swapped into `c`, so the call
 // stays correct even if `c` refers to the same vector as `a` or `b`.
 // The caller must supply at least m*k elements in `a` and k*n in `b`.
 template<typename T>
 void mm(const std::vector<T>& a, const std::vector<T>& b, std::vector<T>& c, size_t m, size_t k, size_t n) {
    std::vector<T> product(m * n, T{});
    for (size_t row = 0; row < m; ++row) {
        for (size_t inner = 0; inner < k; ++inner) {
            // i-k-j loop order walks both b and the output row contiguously.
            const T lhs = a[row * k + inner];
            for (size_t col = 0; col < n; ++col) {
                product[row * n + col] += lhs * b[inner * n + col];
            }
        }
    }
    c.swap(product);
 }

 // Element-wise maximum of a vector and a scalar: b[i] = max(a[i], scalar).
 //
 // `b` is resized to match `a`. Passing the same vector for `a` and `b` is
 // safe because each element is read before it is written.
 template<typename T>
 void vector_scalar_max(const std::vector<T>& a, std::vector<T> &b, T scalar) {
    b.resize(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        b[i] = a[i] > scalar ? a[i] : scalar;
    }
 }
 }
--- a/cc/operators/autodiff.cc
+++ b/cc/operators/autodiff.cc
@@ -0,0 +1,32 @@
 #include "autodiff.h"

 namespace autodiff {

 std::vector<std::shared_ptr<ScalarFunction>> topoSort(const std::vector<std::shared_ptr<ScalarFunction>>& scalars) {
    std::vector<std::shared_ptr<ScalarFunction>> sorted;
    std::vector<std::shared_ptr<ScalarFunction>> frontier;
    std::unordered_map<std::shared_ptr<ScalarFunction>, int> degree;
    for (auto it: scalars) {
        if (it->degree == 0) {
            frontier.push_back(it);
        }
        else {
            degree.insert({it, it->degree});
        }
    }
    while (!frontier.empty()) {
        auto back = frontier.back();
        sorted.push_back(back);
        for (auto &it: degree) {
            if (it.second > 0 && it.first == back) {
                it.second--;
                if (it.second == 0) {
                    frontier.push_back(it.first);
                }
            }
        }
    }
    return sorted;
 }

 }
--- a/cc/operators/autodiff.h
+++ b/cc/operators/autodiff.h
@@ -0,0 +1,211 @@
 #pragma once
 #include <vector>
 #include <memory>
 #include <cmath>
 #include <unordered_map>

 namespace autodiff {

 // Numerically estimates the partial derivative of `func` with respect to
 // vec[arg] using a central difference.
 //
 //   vec     : evaluation point; vec[arg] is perturbed in place and restored
 //             to its exact original value before returning
 //   func    : callable taking the vector and returning a scalar
 //   arg     : index of the coordinate to differentiate
 //   epsilon : requested half-width of the perturbation
 //
 // The quotient uses the step that was actually stored in the vector rather
 // than 2 * epsilon: in single precision the stored perturbation can differ
 // noticeably from the requested one, which would bias the estimate.
 // If epsilon is below the resolution of vec[arg] the step is zero and the
 // result is not finite.
 template<typename T, typename F>
 auto central_difference(std::vector<T>& vec, F func, std::size_t arg, float epsilon = 1e-6) {
    const T original = vec[arg];

    vec[arg] = original + epsilon;
    const T upper_point = vec[arg];
    const auto upper_value = func(vec);

    vec[arg] = original - epsilon;
    const T lower_point = vec[arg];
    const auto lower_value = func(vec);

    vec[arg] = original;
    return (upper_value - lower_value) / (upper_point - lower_point);
 }

 // Base class of every scalar node: a value, a gradient slot and the number
 // of inputs the node was built from.
 class ScalarFunction {
 public:
    float data = 0.0f;   // forward value
    float grad = 0.0f;   // gradient slot
    int degree = 0;      // number of input nodes
 public:
    ScalarFunction() {}
 }; // class ScalarFunction

 // Leaf node holding a fixed value.
 class ConstantScalar: public ScalarFunction {
 public:
    ConstantScalar(float data): ScalarFunction() {
        this->data = data;
    }
 }; // class ConstantScalar

 // c = a + b
 class Add: public ScalarFunction {
 public:
    std::shared_ptr<ScalarFunction> a;
    std::shared_ptr<ScalarFunction> b;
 public:
    // The member initialisers store the operands (sharing ownership with the
    // caller); the body then caches the forward value.
    Add(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b): a(a), b(b) {
        this->data = this->forward();
        this->degree = 2;
    }
    // Sum of the two operands' current values.
    float forward() {
        return a->data + b->data;
    }
    // d(a+b)/da = d(a+b)/db = 1, so the upstream gradient passes through.
    std::vector<float> backward(float d_input) {
        return {d_input, d_input};
    }
 }; // class Add

 // y = log(a)
 class Log: public ScalarFunction {
 public:
    std::shared_ptr<ScalarFunction> a;
 public:
    Log(std::shared_ptr<ScalarFunction> a): a(a) {
        this->data = this->forward();
        this->degree = 1;
    }
    // Natural logarithm of the operand's current value.
    float forward() {
        return logf(a->data);
    }
    // d log(x)/dx = 1/x
    std::vector<float> backward(float d_input) {
        return {(1.0f * d_input / a->data)};
    }
 }; // class Log

 // c = a * b
 class Mul: public ScalarFunction {
 public:
    std::shared_ptr<ScalarFunction> a;
    std::shared_ptr<ScalarFunction> b;
 public:
    Mul(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b) : a(a), b(b) {
        this->data = this->forward();
        this->degree = 2;
    }
    // Product of the two operands' current values.
    float forward() {
        return a->data * b->data;
    }
    // d(ab)/da = b and d(ab)/db = a, each scaled by the upstream gradient.
    std::vector<float> backward(float d_input) {
        return {b->data * d_input, a->data * d_input};
    }
 }; // class Mul

 // y = 1 / a
 class Inv: public ScalarFunction {
 public:
    std::shared_ptr<ScalarFunction> a;
 public:
    Inv(std::shared_ptr<ScalarFunction> a): a(a) {
        this->data = this->forward();
        this->degree = 1;
    }
    float forward() {
        return 1.0f / a->data;
    }
    // d(1/x)/dx = -1/x^2
    std::vector<float> backward(float d_input) {
        return {-d_input / (a->data * a->data)};
    }
 }; // class Inv

 // y = 1 / (1 + exp(-a))
 class Sigmoid: public ScalarFunction {
 public:
    std::shared_ptr<ScalarFunction> a;
 public:
    Sigmoid(std::shared_ptr<ScalarFunction> a): a(a) {
        this->data = this->forward();
        this->degree = 1;
    }
    // Two algebraically equal forms are used so that expf is only ever
    // called with a non-positive argument and cannot overflow.
    float forward() {
        if (this->a->data >= 0.0) {
            return 1.0 / (1.0 + expf(-this->a->data));
        }
        else {
            return expf(this->a->data) / (1.0 + expf(this->a->data));
        }
    }
    // d sigmoid(x)/dx = sigmoid(x) * (1 - sigmoid(x))
    std::vector<float> backward(float d_input) {
        const float s = this->forward();
        return {d_input * s * (1.0f - s)};
    }
 }; // class Sigmoid

 // ---------------------------------------------------------------------
 // Self-tests. They are `inline` because this header may be included from
 // more than one translation unit, and they use std::fabs because an
 // unqualified abs() on a float can bind to int abs(int), which truncates
 // the difference to 0 and makes every tolerance check pass.
 // ---------------------------------------------------------------------

 // d/dx2 (x0 + x1 + x2 + x3 + x4) must be 1.
 inline bool test_central_difference() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    auto func = [](const std::vector<float>& x) -> float {
        return x[0] + x[1] + x[2] + x[3] + x[4];
    };
    auto grad = central_difference(x, func, 2);
    if (std::fabs(grad-1.0f) > 1e-4) {
        return false;
    }
    return true;
 }

 // 1 + 2 = 3; both partial derivatives equal the upstream gradient.
 inline bool test_addscalar() {
    auto a = std::make_shared<ConstantScalar>(1.0f);
    auto b = std::make_shared<ConstantScalar>(2.0f);
    auto c = std::make_shared<Add>(a, b);
    if (c->data != 3.0f) {
        return false;
    }
    auto res = c->backward(2.0f);
    auto a_grad = res[0];
    auto b_grad = res[1];
    if (a_grad != 2.0f || b_grad != 2.0f) {
        return false;
    }
    return true;
 }

 // 2 * 3 = 6; gradients are (b * d, a * d) = (6, 4) for d = 2.
 inline bool test_mulscalar() {
    auto a = std::make_shared<ConstantScalar>(2.0f);
    auto b = std::make_shared<ConstantScalar>(3.0f);
    auto c = std::make_shared<Mul>(a, b);
    if (c->data != 6.0f) {
        return false;
    }
    auto res = c->backward(2.0f);
    auto a_grad = res[0];
    auto b_grad = res[1];
    if (a_grad != 6.0f || b_grad != 4.0f) {
        return false;
    }
    return true;
 }

 // log(2) forward; gradient d / x = 2 / 2 = 1.
 inline bool test_logscalar() {
    auto a = std::make_shared<ConstantScalar>(2.0f);
    auto b = std::make_shared<Log>(a);
    if (std::fabs(b->data - logf(2.0f)) > 1e-4) {
        return false;
    }
    auto res = b->backward(2.0f);
    auto a_grad = res[0];
    if (std::fabs(a_grad - 1.0f) > 1e-4) {
        return false;
    }
    return true;
 }

 // 1/2 forward; gradient -d / x^2 = -2 / 4 = -0.5.
 inline bool test_invscalar() {
    auto a = std::make_shared<ConstantScalar>(2.0f);
    auto b = std::make_shared<Inv>(a);
    if (std::fabs(b->data - 0.5f) > 1e-4) {
        return false;
    }
    auto res = b->backward(2.0f);
    auto a_grad = res[0];
    if (std::fabs(a_grad + 0.5f) > 1e-4) {
        return false;
    }
    return true;
 }

 // Checks both branches of the sigmoid and its derivative s * (1 - s).
 inline bool test_sigmoidscalar() {
    auto a = std::make_shared<ConstantScalar>(2.0f);
    auto b = std::make_shared<Sigmoid>(a);
    const float expected = 1.0f / (1.0f + expf(-2.0f));
    if (std::fabs(b->data - expected) > 1e-4) {
        return false;
    }
    auto res = b->backward(2.0f);
    if (std::fabs(res[0] - 2.0f * expected * (1.0f - expected)) > 1e-4) {
        return false;
    }
    // sigmoid(-x) = 1 - sigmoid(x) exercises the negative-input branch.
    auto c = std::make_shared<ConstantScalar>(-2.0f);
    auto d = std::make_shared<Sigmoid>(c);
    if (std::fabs(d->data - (1.0f - expected)) > 1e-4) {
        return false;
    }
    return true;
 }

 }
--- a/cc/operators/nn.cc
+++ b/cc/operators/nn.cc
@@ -0,0 +1,87 @@
 #include "nn.h"

 namespace nn {

 // Row-wise log-softmax of a (batch_size x num_classes) tensor.
 //
 // For each row the maximum logit is subtracted before exponentiating, and the
 // logarithm of the summed exponentials is then subtracted from the shifted
 // values. Returns a new tensor with the same two dimensions as the input.
 std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits) {
    auto batch_size = logits->shape[0];
    auto num_classes = logits->shape[1];
    auto log_probs_shape = {batch_size, num_classes};
    auto log_probs = std::make_shared<tensor::Tensor>(log_probs_shape);

    for (std::size_t row = 0; row < batch_size; ++row) {
        const std::size_t base = row * num_classes;

        // Largest logit of the row, used as the shift.
        auto max_logit = logits->data[base];
        for (std::size_t col = 1; col < num_classes; ++col) {
            const auto candidate = logits->data[base + col];
            if (!(max_logit > candidate)) {
                max_logit = candidate;
            }
        }

        // Store the shifted logits and accumulate their exponentials.
        auto sum_exp = 0.0;
        for (std::size_t col = 0; col < num_classes; ++col) {
            log_probs->data[base + col] = logits->data[base + col] - max_logit;
            sum_exp += exp(log_probs->data[base + col]);
        }

        // log(softmax) = shifted logit - log(sum of exponentials)
        auto log_sum_exp = log(sum_exp);
        for (std::size_t col = 0; col < num_classes; ++col) {
            log_probs->data[base + col] -= log_sum_exp;
        }
    }

    return log_probs;
 }

 // Back-propagates from `loss` and returns one gradient tensor per entry of
 // `parameters`, in the same order.
 //
 // Steps:
 //   1. mark the loss as used;
 //   2. walk the graph depth-first from the loss, recording every node after
 //      its parents (the "tape");
 //   3. allocate a zero gradient for every recorded node and every parameter,
 //      and seed the loss gradient with 1;
 //   4. replay the tape in reverse, adding each node's backward() results to
 //      its parents' gradients.
 std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters) {
    loss->used = true;

    std::unordered_set<std::shared_ptr<Node>> nodes;
    std::vector<std::shared_ptr<Node>> tape;

    // Post-order traversal: parents are recorded before the node itself.
    std::function<void(std::shared_ptr<Node>)> visit = [&](std::shared_ptr<Node> current) {
        if (nodes.find(current) != nodes.end()) {
            return;
        }
        for (const auto& parent : current->get_parents()) {
            visit(parent);
        }
        nodes.insert(current);
        tape.push_back(current);
    };

    visit(loss);
    for (const auto& param : parameters) {
        nodes.insert(param);
    }

    // One zero-initialised accumulator per known node.
    std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<tensor::Tensor>> grads;
    for (const auto& known : nodes) {
        grads[known] = std::make_shared<tensor::Tensor>(known->data->shape);
    }
    grads[loss] = std::make_shared<tensor::Tensor>(loss->data->shape);
    grads[loss]->data[0] = 1.0;

    // Reverse replay: children push their gradients onto their parents.
    for (auto it = tape.rbegin(); it != tape.rend(); ++it) {
        auto node = *it;
        auto parent_grads = node->backward(grads[node]);
        auto parents = node->get_parents();
        for (size_t i = 0; i < parents.size(); i++) {
            const std::size_t count = parents[i]->data->size;
            for (std::size_t ind = 0; ind < count; ++ind) {
                grads[parents[i]]->data[ind] += parent_grads[i]->data[ind];
            }
        }
    }

    std::vector<std::shared_ptr<tensor::Tensor>> result;
    for (const auto& param : parameters) {
        result.emplace_back(grads[param]);
    }

    return result;
 }

 }
--- a/cc/operators/nn.h
+++ b/cc/operators/nn.h
@@ -0,0 +1,274 @@
 #pragma once
 #include <vector>
 #include <memory>
 #include <unordered_set>
 #include <unordered_map>
 #include <algorithm>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
 #include <iostream>
 #include "../tensor/tensor.h"
 #include "../math/arith.h"

 namespace py = pybind11;

 namespace nn {

 // Abstract base of every node in the computation graph.
 //
 // A node owns its output tensor (`data`) and keeps shared references to the
 // nodes it was computed from (`objects`).
 class Node {
 public:
    std::shared_ptr<tensor::Tensor> data;                    // output of this node
    std::vector<std::shared_ptr<Node>> objects;              // input (parent) nodes
    std::vector<std::shared_ptr<tensor::Tensor>> gradient;   // gradient storage
 public:
    Node() = default;
    virtual ~Node() = default;

    // Computes and returns this node's output tensor.
    virtual std::shared_ptr<tensor::Tensor> forward() = 0;
    // Maps the gradient w.r.t. this node's output to one gradient per parent.
    virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;

    // Copy of the list of parent nodes.
    std::vector<std::shared_ptr<Node>> get_parents() {
        return objects;
    }
    // Copy of the flat values of the output tensor.
    std::vector<float> get_data() {
        return data->data;
    }
    // Shared handle to the output tensor itself.
    std::shared_ptr<tensor::Tensor> get_tensor() {
        return data;
    }
 };

 // Marker base class for nodes that hold data rather than compute it
 // (Parameter and Constant derive from it).
 class DataNode: public Node {
 public:
    DataNode() = default;
 }; // class DataNode

 class Parameter: public DataNode {
 public:
    // Parameter(const std::vector<std::size_t>& shape) {
    //     this->data = std::make_shared<tensor::Tensor>(shape, true);
    // }
    Parameter(py::array_t<float> array) {
        py::buffer_info info = array.request();
        float* dataPtr = static_cast<float*>(info.ptr);
        std::vector<std::size_t> shape = {};
        for (auto &it: info.shape) {
            shape.push_back(it);
        }
        auto tensor = std::make_shared<tensor::Tensor>(shape);
        std::vector<float> result(dataPtr, dataPtr + info.size);
        tensor->data = result;
        this->data = tensor;
    }
    std::shared_ptr<tensor::Tensor> forward() {
        return this->data;
    };
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
        return {gradient};
    };
    void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
        for (auto i = 0; i < this->data->size; i++) {
            this->data->data[i] -= lr * grad->data[i];
        }
    }
 }; // class Parameter

 class Constant: public DataNode {
 public:
    Constant(std::shared_ptr<tensor::Tensor> data) {
        this->data = data;
    }
    Constant(py::array_t<float> array) {
        this->data = tensor::pyarray_to_tensor(array);
    }
    std::shared_ptr<tensor::Tensor> forward() {
        return this->data;
    };
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) {
        return {gradient};
    };
    // void update(std::shared_ptr<tensor::Tensor> grad, float lr) {}
 }; // class Constant

 class FunctionNode: public Node {
 public:
    FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
        this->objects.emplace_back(a);
        this->objects.emplace_back(b);
    }
    FunctionNode(std::shared_ptr<Node> a) {
        this->objects.emplace_back(a);
    }

    std::shared_ptr<tensor::Tensor> forward() override {
        return nullptr;
    }
 }; //class FunctionNode

 // Element-wise sum of two nodes holding the same number of elements.
 class Add: public FunctionNode {
 public:
    Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        this->data = this->forward();
    }
    // Returns a new tensor shaped like the first operand with out[i] = a[i] + b[i].
    // Throws std::invalid_argument when the operands differ in element count;
    // previously a smaller second operand was read out of bounds.
    std::shared_ptr<tensor::Tensor> forward() override {
        auto a = this->objects[0];
        auto b = this->objects[1];
        if (a->data->size != b->data->size) {
            throw std::invalid_argument("Add: operands must have the same number of elements");
        }
        auto outNode = std::make_shared<tensor::Tensor>(a->data->shape);
        for (std::size_t i = 0; i < a->data->size; i++) {
            outNode->data[i] = a->data->data[i] + b->data->data[i];
        }
        return outNode;
    }
    // Both partial derivatives are 1, so each operand receives the incoming gradient.
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        return {gradient, gradient};
    }
 };

 // Adds a bias row to every row of a feature matrix.
 class AddBias: public FunctionNode {
 public:
    AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        this->data = this->forward();
    }
    // features: a Node with shape (batch_size x num_features)
    // bias:     a Node with shape (1 x num_features)
    // Returns out[r][c] = features[r][c] + bias[c].
    // Throws std::invalid_argument when the feature count is not a whole
    // multiple of the bias length.
    std::shared_ptr<tensor::Tensor> forward() override {
        auto features = this->objects[0];
        auto bias = this->objects[1];
        const std::size_t num_features = bias->data->size;
        if (num_features == 0 || features->data->size % num_features != 0) {
            throw std::invalid_argument("AddBias: bias length must divide the number of feature elements");
        }
        auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
        for (std::size_t i = 0; i < features->data->size; i++) {
            // Row-major storage: the column of flat index i is i % num_features.
            outNode->data[i] = features->data->data[i] + bias->data->data[i % num_features];
        }
        return outNode;
    }
    // The features receive the gradient unchanged; the bias receives the
    // gradient summed over the batch dimension (one total per column).
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
        const std::size_t num_features = g_bias->size;
        if (num_features > 0) {
            for (std::size_t i = 0; i < gradient->size; i++) {
                g_bias->data[i % num_features] += gradient->data[i];
            }
        }

        return {gradient, g_bias};
    }
    // Copy of the flat output values.
    std::vector<float> get_data() {
        return this->data->data;
    }
 }; // class AddBias


 class Linear: public FunctionNode {
 public:
    Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
        // 这段代码就一行，参考下别的类是怎么写的呢？
        // 在这里补全
    }
    std::shared_ptr<tensor::Tensor> forward() override {
        // features: (batch_size x input_features)
        auto features = this->objects[0];
        // weights: (input_features x output_features)
        auto weights = this->objects[1];
        auto m = features->data->shape[0];
        auto k = features->data->shape[1];
        auto n = weights->data->shape[1];
        // std::cout << m << " " << n << " " << k << std::endl;
        // output: (batch_size x output_features)
        auto shape = {m, n};
        auto outNode = std::make_shared<tensor::Tensor>(shape);
        // 实际上你需要补全的是arith::mm函数，快去找找它在哪里
        // 其余部分不需要动
        arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
        return outNode;
    }

    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto features = this->objects[0];
        auto weights = this->objects[1];
        // gradient.shape[0] == features.shape[0]
        // gradient.shape[1] == weights.shape[1]
        auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
        auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
        auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
        auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
        // 这里要调用两次arith:mm，是分别把哪两个矩阵相乘呢？
        return {grad_features, grad_weights};
    }
 }; //class Linear

 class ReLU: public FunctionNode {
 public:
    ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
        // 补全这里
    }
    std::shared_ptr<tensor::Tensor> forward() override {
        // x: a Node with shape (batch_size x num_features)
        auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
        // 补全这里，调用arith::vector_scalar_max
        return outNode;
    }
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
        // 补全这里，一个for循环
        
        return {grads};
    }
 }; // class ReLU

 // Base class for two-input loss nodes.
 class Loss: public FunctionNode {
 public:
    // Set to true by nn::gradients once the loss has been back-propagated.
    bool used{false};
 public:
    Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b)
        : FunctionNode(a, b) {
    }
 };

 class SquareLoss: public Loss {
 public:
    SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
        // 补全这里的代码
    }
    std::shared_ptr<tensor::Tensor> forward() {
        // a: a Node with shape (batch_size x dim)
        // b: a Node with shape (batch_size x dim)
        // 这个简单，就是要注意返回的res需要是一个tensor就行
        // 修改下面的代码
        std::vector<size_t> res_shape = {1};
        auto res = std::make_shared<tensor::Tensor>(res_shape);
        return res;
    }
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        float g = gradient->data[0];
        auto a = this->objects[0];
        auto b = this->objects[1];
        auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
        auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
        // 补全下面的代码
        return {grad_a, grad_b};
    }
 }; // class SquareLoss

 std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);

 class SoftmaxLoss: public Loss {
 public:
    SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
        this->data = this->forward();
    }

    std::shared_ptr<tensor::Tensor> forward() {
        // 我们已经帮你写好log_softmax
        auto log_probs = log_softmax(this->objects[0]->data);
        // 补全下面的代码，计算softmax loss
        std::vector<size_t> res_shape = {1};
        auto res = std::make_shared<tensor::Tensor>(res_shape);
        return res;
    }
    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
        auto log_probs = log_softmax(this->objects[0]->data);
        auto labels = this->objects[1]->data;
        auto batch_size = log_probs->shape[0];
        auto num_classes = log_probs->shape[1];
        auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
        auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
        // 补全下面的代码
        return {grad_logits, grad_labels};
    }
 }; // class SoftmaxLoss

 std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);

 }
--- a/cc/operators/ops.cc
+++ b/cc/operators/ops.cc
@@ -0,0 +1,54 @@
 #include "ops.h"

 namespace operators {
 // Absolute tolerance used by is_close.
 static float epsilon = 1e-6;

 // Returns 1.0 when |x - y| is smaller than `epsilon`, otherwise 0.0.
 // The result is a float (not a bool) to match the declared signature.
 float is_close(float x, float y) {
    return std::fabs(x - y) < epsilon ? 1.0f : 0.0f;
 }

 // Logistic sigmoid 1 / (1 + e^-x).
 // Two algebraically equal forms are used so that expf is only ever called
 // with a non-positive argument and therefore cannot overflow.
 float sigmoid(float x) {
    if (x >= 0.0f) {
        return 1.0f / (1.0f + expf(-x));
    }
    const float e = expf(x);
    return e / (1.0f + e);
 }

 // Rectified linear unit: x when x is positive, otherwise 0.
 float relu(float x) {
    return x > 0.0f ? x : 0.0f;
 }

 // Reciprocal 1 / x. Follows IEEE-754 semantics for x == 0 (returns +/-inf).
 float inv(float x) {
    return 1.0f / x;
 }

 // Backward pass of inv: d(1/x)/dx = -1/x^2, scaled by the upstream gradient d.
 float inv_back(float x, float d) {
    return -d / (x * x);
 }

 // Backward pass of relu: the upstream gradient d passes through where the
 // input x was strictly positive and is blocked (0) elsewhere.
 float relu_back(float x, float d) {
    return x > 0.0f ? d : 0.0f;
 }

 // Folds the list with add<float>, starting from 0.
 auto sumList(const std::vector<float>& vec) -> float {
    const float total = reduce(vec, 0.0f, add<float>);
    return total;
 }

 // Product of all elements; the empty product is 1.
 auto prodList(const std::vector<float>& vec) -> float {
    return std::accumulate(vec.begin(), vec.end(), 1.0f, std::multiplies<float>());
 }

 // Element-wise sum of two lists of equal length.
 // Throws std::invalid_argument when the lengths differ, mirroring zipWith.
 auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float> {
    if (vec1.size() != vec2.size()) {
        throw std::invalid_argument("Vectors must have the same size");
    }
    std::vector<float> result(vec1.size());
    std::transform(vec1.begin(), vec1.end(), vec2.begin(), result.begin(), std::plus<float>());
    return result;
 }

 // Returns a list of the same length with every element negated.
 auto negList(const std::vector<float>& vec) -> std::vector<float> {
    std::vector<float> result(vec.size());
    std::transform(vec.begin(), vec.end(), result.begin(), std::negate<float>());
    return result;
 }
 }
--- a/cc/operators/ops.h
+++ b/cc/operators/ops.h
@@ -0,0 +1,88 @@
 #pragma once
 #include <cmath>
 #include <functional>
 #include <vector>
 #include <algorithm>
 #include <stdexcept>
 #include <numeric>

 namespace operators {

 // Product of two values.
 template<typename T>
 T mul(T a, T b) {
    return a * b;
 }

 // Identity: returns its argument unchanged.
 template<typename T>
 T id(T a) {
    return a;
 }

 // Sum of two values.
 template<typename T>
 T add(T a, T b) {
    return a + b;
 }

 // Arithmetic negation.
 template<typename T>
 T neg(T a) {
    return -a;
 }

 // 1.0 when a < b, otherwise 0.0 (a float result, as declared).
 template<typename T>
 float lt(T a, T b) {
    return a < b ? 1.0f : 0.0f;
 }

 // 1.0 when a == b, otherwise 0.0 (a float result, as declared).
 template<typename T>
 float eq(T a, T b) {
    return a == b ? 1.0f : 0.0f;
 }

 // Larger of the two values; returns b when they compare equal.
 template<typename T>
 T max(T a, T b) {
    return a > b ? a : b;
 }

 // Applies `func` to every element and returns the results in order.
 template<typename T, typename F>
 auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> {

    std::vector<decltype(func(std::declval<T>()))> result;
    result.reserve(vec.size());

    std::transform(vec.begin(), vec.end(), std::back_inserter(result), func);

    return result;
 }

 // Combines two equally long vectors element by element:
 // result[i] = func(vec1[i], vec2[i]).
 // Throws std::invalid_argument when the lengths differ.
 template <typename T1, typename T2, typename F>
 auto zipWith(const std::vector<T1>& vec1, const std::vector<T2>& vec2, F func)
    -> std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))> {

    if (vec1.size() != vec2.size()) {
        throw std::invalid_argument("Vectors must have the same size");
    }

    std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))> result;
    result.reserve(vec1.size());
    for (std::size_t i = 0; i < vec1.size(); ++i) {
        result.push_back(func(vec1[i], vec2[i]));
    }

    return result;
 }

 // Left fold: func(...func(func(init, vec[0]), vec[1])..., vec[n-1]).
 template<typename T, typename F>
 auto reduce(const std::vector<T>& vec, T init, F func) -> T {
    return std::accumulate(vec.begin(), vec.end(), init, func);
 }

 // Scalar helpers; defined in ops.cc.
 float is_close(float x, float y);
 float sigmoid(float x);
 float relu(float x);
 float inv(float x);
 float inv_back(float x, float d);
 float relu_back(float x, float d);

 // List helpers; defined in ops.cc.
 auto sumList(const std::vector<float>& vec) -> float;
 auto prodList(const std::vector<float>& vec) -> float;
 auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float>;
 auto negList(const std::vector<float>& vec) -> std::vector<float>;
 }
--- a/cc/tensor/pyarray.cc
+++ b/cc/tensor/pyarray.cc
@@ -0,0 +1,12 @@
 #include "pyarray.h"

 namespace pyarr {

 // Copies the elements of a NumPy array into a flat std::vector<float> in
 // C (row-major) order.
 //
 // The array is first converted to a C-contiguous float array. Reading the
 // raw buffer directly, as before, returned elements in the wrong order for
 // strided inputs such as transposed views or slices.
 //
 // Throws py::value_error when the input cannot be converted.
 std::vector<float> ndarray_to_vector(py::array_t<float> array) {
    auto dense = py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(array);
    if (!dense) {
        throw py::value_error("expected an array convertible to contiguous float32");
    }
    py::buffer_info info = dense.request();
    const float* first = static_cast<const float*>(info.ptr);
    return std::vector<float>(first, first + info.size);
 }

 }
--- a/cc/tensor/pyarray.h
+++ b/cc/tensor/pyarray.h
@@ -0,0 +1,10 @@
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>

 namespace py = pybind11;

 namespace pyarr {

 std::vector<float> ndarray_to_vector(py::array_t<float> array);

 }
--- a/cc/tensor/tensor.cc
+++ b/cc/tensor/tensor.cc
@@ -0,0 +1,76 @@
 #include "tensor.h"

 namespace tensor {

 std::shared_ptr<Tensor> Tensor::transpose() {
    // 放心，下面的代码暂时不会被触发，我们假定所有的tensor都是2维的
    // if (shape.size() != 2) {
    //     throw std::runtime_error("Transpose is only supported for 2D tensors.");
    // }

    // 这里能够获得矩阵的行数和列数，但是我们是使用一个一维的vector来存储数据的。该如何实现“转置”呢？
    std::size_t rows = shape[0];
    std::size_t cols = shape[1];
    std::vector<size_t> new_shape = {cols, rows};
    // 你知道这里的size变量在哪里定义的吗？在VSCode里面安装C/C++ Extension Pack后，按下ctrl键并单击变量size，VSCode就会把你导向定义这个变量的地方！
    std::vector<float> transposed_data(size);

    // 请在这里写转置的代码

    // 请阅读关于Tensor的定义，在这里创建一个新的Tensor
    // 注意，要使用shared_ptr哦！
    
    return std::make_shared<Tensor>(new_shape);
 }


 // Builds a Tensor with the same shape as a NumPy array and a row-major copy
 // of its values.
 //
 // The array is first converted to a C-contiguous float array. Reading the
 // raw buffer directly, as before, produced wrongly ordered data for strided
 // inputs such as transposed views or slices.
 //
 // Throws py::value_error when the input cannot be converted.
 std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array) {
    auto dense = py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(array);
    if (!dense) {
        throw py::value_error("expected an array convertible to contiguous float32");
    }
    py::buffer_info info = dense.request();
    const float* first = static_cast<const float*>(info.ptr);

    std::vector<std::size_t> shape;
    shape.reserve(info.shape.size());
    for (auto extent : info.shape) {
        shape.push_back(static_cast<std::size_t>(extent));
    }

    auto tensor = std::make_shared<Tensor>(shape);
    tensor->data.assign(first, first + info.size);
    return tensor;
 }

 // Index of the largest element along one axis of a 2-D tensor.
 //
 //   tensor : shape (batch_size, features), stored row-major
 //   axis   : 0 -> for every column, the row index of its maximum
 //                 (result has `features` elements)
 //            1 -> for every row, the column index of its maximum
 //                 (result has `batch_size` elements)
 //
 // The result is one-dimensional (the reduced axis is removed from the
 // shape) and stores each index as a float. Ties resolve to the first
 // occurrence.
 //
 // Throws std::invalid_argument when the tensor is not 2-D or axis is not 0/1.
 std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis) {
    if (tensor->shape.size() != 2 || (axis != 0 && axis != 1)) {
        throw std::invalid_argument("argmax expects a 2D tensor and axis 0 or 1");
    }
    const std::size_t rows = tensor->shape[0];
    const std::size_t cols = tensor->shape[1];

    // compute the output's shape
    std::vector<std::size_t> output_shape = tensor->shape;
    output_shape.erase(output_shape.begin() + axis);

    auto result = std::make_shared<Tensor>(output_shape);

    // `lanes` independent searches, each over `candidates` elements.
    const std::size_t lanes = (axis == 0) ? cols : rows;
    const std::size_t candidates = (axis == 0) ? rows : cols;
    auto element = [&](std::size_t lane, std::size_t candidate) {
        return (axis == 0) ? tensor->data[candidate * cols + lane]
                           : tensor->data[lane * cols + candidate];
    };

    for (std::size_t lane = 0; lane < lanes; ++lane) {
        std::size_t best = 0;
        for (std::size_t candidate = 1; candidate < candidates; ++candidate) {
            if (element(lane, candidate) > element(lane, best)) {
                best = candidate;
            }
        }
        result->data[lane] = static_cast<float>(best);
    }
    return result;
 }

 // Average of all elements, returned as a one-element tensor.
 std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor) {
    auto total = 0.0f;
    for (std::size_t i = 0; i < tensor->data.size(); ++i) {
        total += tensor->data[i];
    }
    total /= tensor->size;

    std::vector<std::size_t> scalar_shape = {1};
    auto result = std::make_shared<Tensor>(scalar_shape);
    result->data[0] = total;
    return result;
 }

 // Element-wise e^x; returns a new tensor with the input's shape.
 std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor) {
    auto result = std::make_shared<Tensor>(tensor->shape);
    const std::size_t count = tensor->size;
    for (std::size_t idx = 0; idx < count; ++idx) {
        const float value = tensor->data[idx];
        result->data[idx] = expf(value);
    }
    return result;
 }

 }
--- a/cc/tensor/tensor.h
+++ b/cc/tensor/tensor.h
@@ -0,0 +1,92 @@
 #pragma once
 #include <numeric>
 #include <random>
 #include <vector>
 #include <memory>
 #include <stdexcept>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>

 namespace py = pybind11;

 namespace tensor {

 // Dense float tensor stored row-major in a flat vector.
 class Tensor {
 public:
    std::vector<float> data;          // flat element storage, `size` entries
    std::vector<std::size_t> shape;   // extent of each dimension
    std::size_t size;                 // product of all extents

 public:
    // Creates a zero-filled tensor of the given shape.
    //
    // With rand_init the elements are instead drawn uniformly from
    // [-limit, limit], limit = sqrt(3 / ((shape[0] + shape[1]) / 2)), using a
    // fixed seed so results are reproducible. Random initialisation needs at
    // least two dimensions; std::invalid_argument is thrown otherwise.
    Tensor(const std::vector<std::size_t>& shape, bool rand_init = false) {
        // Accumulate in size_t: an int accumulator would overflow for large shapes.
        this->size = std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<std::size_t>());
        this->data.resize(this->size);
        this->shape = shape;
        if (rand_init) {
            if (shape.size() < 2) {
                throw std::invalid_argument("Random initialisation requires at least two dimensions");
            }
            double limit = std::sqrt(3.0 / ((shape[0] + shape[1]) / 2.0));
            std::mt19937 gen(42);
            std::uniform_real_distribution<float> dis(-limit, limit);
            for (std::size_t i = 0; i < this->size; ++i) {
                this->data[i] = dis(gen);
            }
        }
    }
    // Returns the transpose of a 2-D tensor (defined in tensor.cc).
    std::shared_ptr<Tensor> transpose();

    // Element-wise sum. Throws std::runtime_error when the shapes differ.
    Tensor operator+(const Tensor& other) const {
        if (this->shape != other.shape) {
            throw std::runtime_error("Shapes do not match");
        }
        Tensor result(this->shape);
        for (std::size_t i = 0; i < this->size; ++i) {
            result.data[i] = this->data[i] + other.data[i];
        }
        return result;
    }

    // Element-wise equality mask: 1.0 where the elements are equal, 0.0
    // elsewhere. Throws std::runtime_error when the shapes differ.
    //
    // This was previously spelled `operator=`, which replaced copy assignment
    // with a comparison that never assigned anything. As `operator==` the
    // comparison keeps its behaviour and `a = b` copies again.
    Tensor operator==(const Tensor& other) const {
        if (this->shape != other.shape) {
            throw std::runtime_error("Shapes do not match");
        }
        Tensor result(this->shape);
        for (std::size_t i = 0; i < this->size; i++) {
            result.data[i] = (this->data[i] == other.data[i]);
        }
        return result;
    }

    // Copy of the shape.
    std::vector<std::size_t> get_shape() const {
        return this->shape;
    }

    // Copy of the flat element storage.
    std::vector<float> get_data() const {
        return this->data;
    }

    // Reads the element at a multi-dimensional index.
    // Throws std::invalid_argument for a wrong number of indices and
    // std::out_of_range for an index outside its dimension.
    float get(const std::vector<std::size_t>& indices) const {
        return data[flat_index(indices)];
    }

    // Writes the element at a multi-dimensional index; same checks as get().
    void set(const std::vector<std::size_t>& indices, float value) {
        data[flat_index(indices)] = value;
    }
    ~Tensor() = default;

 private:
    // Converts a multi-dimensional index to its row-major flat offset.
    std::size_t flat_index(const std::vector<std::size_t>& indices) const {
        if (indices.size() != shape.size()) {
            throw std::invalid_argument("Number of indices does not match tensor rank");
        }
        std::size_t index = 0;
        std::size_t stride = 1;
        // Walk from the last (fastest-varying) dimension to the first.
        for (std::size_t dim = shape.size(); dim-- > 0;) {
            if (indices[dim] >= shape[dim]) {
                throw std::out_of_range("Tensor index out of range");
            }
            index += indices[dim] * stride;
            stride *= shape[dim];
        }
        return index;
    }
 };  // class Tensor

 std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array);
 std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis);
 std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor);
 std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor);
 }  // namespace tensor
--- a/cc/uctc.cc
+++ b/cc/uctc.cc
@@ -0,0 +1,117 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include "math/arith.h"
 #include "operators/nn.h"
 #include "tensor/tensor.h"
 #include "operators/ops.h"
 #include "operators/autodiff.h"

 namespace py = pybind11;

 // Python extension entry point: defines the `uctc` module and its submodules
 // (`C.arith`, `nn`, `framework.basis`, `framework.autodiff`) plus the Tensor
 // and nn node classes.
 //
 // Several local py::module variables below (`arith`, `nn`, `autodiff`) share
 // their name with a C++ namespace. Expressions such as `&arith::sqrt` still
 // refer to the namespace, because name lookup to the left of `::` ignores
 // variables.
 PYBIND11_MODULE(uctc, m) {

    py::module C = m.def_submodule("C", "C module");

    // uctc.C.arith: scalar arithmetic helpers.
    py::module arith = C.def_submodule("arith", "Arithmetic module");
    arith.def("sqrt", &arith::sqrt, "Square root function", py::arg("x") = 0.0);

    // uctc.Tensor: read-only view of shape/size plus data() and transpose().
    py::class_<tensor::Tensor, std::shared_ptr<tensor::Tensor>>(m, "Tensor")
    .def_readonly("shape", &tensor::Tensor::shape)
    .def_readonly("size", &tensor::Tensor::size)
    .def("data", &tensor::Tensor::get_data, "Get the data of the tensor", pybind11::return_value_policy::copy)
    .def("transpose", &tensor::Tensor::transpose, "Transpose the tensor", pybind11::return_value_policy::copy);
    
    // uctc.nn: computation-graph node classes. Each class is registered with
    // its C++ base so Python sees the same inheritance chain
    // (Node -> DataNode -> Parameter/Constant, Node -> FunctionNode -> ...).
    py::module nn = m.def_submodule("nn", "Neural network module");
    py::class_<nn::Node, std::shared_ptr<nn::Node>>(nn, "Node")
    .def("data", &nn::Node::get_data, "Get the data of the node", pybind11::return_value_policy::copy)
    .def("tensor", &nn::Node::get_tensor, "Get the tensor of the node", pybind11::return_value_policy::automatic_reference);

    py::class_<nn::DataNode, nn::Node, std::shared_ptr<nn::DataNode>>(nn, "DataNode");

    py::class_<nn::Parameter, nn::DataNode, std::shared_ptr<nn::Parameter>>(nn, "Parameter")
    .def(pybind11::init<py::array_t<float>>(), "Create a parameter from an array.")
    .def("update", &nn::Parameter::update, "Update the parameter node", py::arg("grad") = nullptr, py::arg("learning_rate") = 0.001);

    py::class_<nn::Constant, nn::DataNode, std::shared_ptr<nn::Constant>>(nn, "Constant")
    .def(pybind11::init<py::array_t<float>>(), "Create a constant node from a numpy array");

    py::class_<nn::FunctionNode, nn::Node, std::shared_ptr<nn::FunctionNode>>(nn, "FunctionNode");

    py::class_<nn::Add, nn::FunctionNode, std::shared_ptr<nn::Add>>(nn, "Add")
    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add function node")
    .def("forward", &nn::Add::forward, "Forward function");

    py::class_<nn::AddBias, nn::FunctionNode, std::shared_ptr<nn::AddBias>>(nn, "AddBias")
    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add bias function node")
    .def("forward", &nn::AddBias::forward, "Forward function")
    .def("data", &nn::AddBias::get_data, "Get the data of the node", pybind11::return_value_policy::automatic_reference);

    py::class_<nn::Linear, nn::FunctionNode, std::shared_ptr<nn::Linear>>(nn, "Linear")
    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a linear function node")
    .def("forward", &nn::Linear::forward, "Forward function");

    py::class_<nn::ReLU, nn::FunctionNode, std::shared_ptr<nn::ReLU>>(nn, "ReLU")
    .def(py::init<std::shared_ptr<nn::Node>>(), "Create a ReLU function node");

    py::class_<nn::Loss, nn::FunctionNode, std::shared_ptr<nn::Loss>>(nn, "Loss");

    py::class_<nn::SquareLoss, nn::Loss, std::shared_ptr<nn::SquareLoss>>(nn, "SquareLoss")
    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a square loss function node");
    py::class_<nn::SoftmaxLoss, nn::Loss, std::shared_ptr<nn::SoftmaxLoss>>(nn, "SoftmaxLoss")
    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a softmax loss function node");
    
    // Free functions of uctc.nn; the tensor helpers are re-exported here too.
    nn.def("log_softmax", &nn::log_softmax, "Log softmax function", py::arg("logits"));

    nn.def("gradients", &nn::gradients, "Calculate the gradients", py::arg("loss") = nullptr, py::arg("nodes") = std::vector<std::shared_ptr<nn::Node>>{});
    nn.def("pyarray_to_tensor", &tensor::pyarray_to_tensor, "Convert a numpy array to a tensor", py::arg("arr"));
    nn.def("argmax", &tensor::argmax, "Get a tensor's argmax", py::arg("tensor"), py::arg("axis"));
    nn.def("mean", &tensor::mean, "Get a tensor element's mean value", py::arg("tensor"));
    nn.def("exp", &tensor::exp, "Get exp of a tensor", py::arg("tensor"));

    // framework test
    // uctc.framework.basis: the exercise functions checked by the Python
    // test_task*.py scripts.
    py::module framework = m.def_submodule("framework", "Framework module");
    py::module basis = framework.def_submodule("basis", "Basic modules");
    
    // task 1
    // Generic operators, bound here for int only.
    basis.def("mul", &operators::mul<int>, "Multiply two integers", py::arg("a"), py::arg("b"));
    basis.def("id", &operators::id<int>, "Identity function", py::arg("a"));
    basis.def("add", &operators::add<int>, "Add two integers", py::arg("a"), py::arg("b"));
    basis.def("neg", &operators::neg<int>, "Negate an integer", py::arg("a"));
    basis.def("lt", &operators::lt<int>, "Less than operator", py::arg("a"), py::arg("b"));
    basis.def("eq", &operators::eq<int>, "Equal operator", py::arg("a"), py::arg("b"));
    basis.def("max", &operators::max<int>, "Max operator", py::arg("a"), py::arg("b"));
    
    // task 2
    basis.def("is_close", &operators::is_close, "Check if two floats are close", py::arg("x"), py::arg("y"));
    basis.def("sigmoid", &operators::sigmoid, "Sigmoid function", py::arg("x"));
    basis.def("relu", &operators::relu, "ReLU function", py::arg("x"));
    basis.def("inv", &operators::inv, "Inverse function", py::arg("x"));
    basis.def("inv_back", &operators::inv_back, "Inv back function", py::arg("x"), py::arg("d"));
    basis.def("relu_back", &operators::relu_back, "ReLU back function", py::arg("x"), py::arg("d"));

    // task 3
    basis.def("negList", &operators::negList, "Negate a list of integers", py::arg("lst"));

    // task 4, 5
    basis.def("addLists", &operators::addLists, "Add two lists of integers", py::arg("lst1"), py::arg("lst2"));

    // task 6
    basis.def("sumList", &operators::sumList, "Sum a list of integers", py::arg("lst"));

    // task 7
    basis.def("prodList", &operators::prodList, "Multiply a list of integers", py::arg("lst"));

    // uctc.framework.autodiff: argument-less self-tests; the Python driver
    // treats a truthy return value as a pass.
    py::module autodiff = framework.def_submodule("autodiff", "Autodiff modules");
    autodiff.def("test_central_difference", &autodiff::test_central_difference, "Test central difference");
    
    autodiff.def("test_addscalar", &autodiff::test_addscalar, "Test add scalar");

    autodiff.def("test_mulscalar", &autodiff::test_mulscalar, "Test mul scalar");

    autodiff.def("test_logscalar", &autodiff::test_logscalar, "Test log scalar");

    autodiff.def("test_invscalar", &autodiff::test_invscalar, "Test inv scalar");

    autodiff.def("test_sigmoidscalar", &autodiff::test_sigmoidscalar, "Test sigmoid scalar");
 }

--- a/frontend/framework/autodiff/test_task7.py
+++ b/frontend/framework/autodiff/test_task7.py
@@ -0,0 +1,16 @@
 from uctc.framework import autodiff
 import numpy as np
 from functools import reduce
 import random

 # Run every autodiff self-test exported by the extension module. Each one
 # takes no arguments and returns a truthy value on success.
 lst = [autodiff.test_central_difference, autodiff.test_addscalar, autodiff.test_mulscalar, autodiff.test_logscalar, autodiff.test_invscalar, autodiff.test_sigmoidscalar]
 for e in lst:
    if e():
        print(f"\033[1;34mPassed: {e.__name__} passed all tests\033[0m")
    else:
        print(f"\033[1;31mError: {e.__name__} failed test... expects true but gets false\033[0m")
        # Non-zero status so shells and CI can see the failure
        # (the previous exit(0) reported success).
        raise SystemExit(1)

 # The banner previously said "Task 3" (copy-paste); this script is test_task7.
 print(f"\033[1;32m[PASSED] Task 7 finished!\033[0m")
--- a/frontend/framework/basis/config.py
+++ b/frontend/framework/basis/config.py
@@ -0,0 +1,2 @@
 # Machine-specific absolute path; change this to match your own checkout.
 # Presumably the CMake build directory that holds the compiled `uctc`
 # extension -- confirm against the code that reads `lib_path`.
 lib_path = "/home/hexu/learn/uc-modern-cpp-student/cc/build/"
--- a/frontend/framework/basis/test_task1.py
+++ b/frontend/framework/basis/test_task1.py
@@ -0,0 +1,46 @@
 import numpy as np
 import math
 from uctc.framework import basis
 # Integer operand pairs fed to every two-argument operator under test.
 binary_arguments = [
    (1, 2), (-2, 1), (1, 1), (2, -2),
    (1, 3), (3, 1), (-3, 3),
    (4, 5), (5, 4), (4, 4), (5, 5),
 ]

 # Integer operands fed to every one-argument operator under test.
 singular_arguments = [1, 2, 4, -32, 42, 28, 0, 100, -1000, 10000, -100000]

 def iterate_binary_arguments(func, std_func):
    """Check ``func`` against ``std_func`` on every pair in ``binary_arguments``.

    Prints a red error and terminates the script with exit status 1 on the
    first mismatch; otherwise prints a blue "Passed" line and returns True.
    """
    for argument in binary_arguments:
        # Evaluate each side once and reuse the values in the message.
        actual = func(*argument)
        expected = std_func(*argument)
        if actual != expected:
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the previous exit(0) reported success on failure.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True

 def iterate_singular_arguments(func, std_func):
    """Check ``func`` against ``std_func`` on every value in ``singular_arguments``.

    Prints a red error and terminates the script with exit status 1 on the
    first mismatch; otherwise prints a blue "Passed" line and returns True.
    """
    for argument in singular_arguments:
        # Evaluate each side once and reuse the values in the message.
        actual = func(argument)
        expected = std_func(argument)
        if actual != expected:
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the previous exit(0) reported success on failure.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True

 # Task 1: compare each integer operator binding with a pure-Python reference.
 # Bindings are looked up by name inside the loop so they resolve one at a
 # time, in the same order as the checks run.
 for check, binding_name, reference in (
    (iterate_binary_arguments, "mul", lambda x, y: x * y),
    (iterate_singular_arguments, "id", lambda x: x),
    (iterate_binary_arguments, "add", lambda x, y: x + y),
    (iterate_singular_arguments, "neg", lambda x: -x),
    (iterate_binary_arguments, "lt", lambda x, y: int(x < y)),
    (iterate_binary_arguments, "eq", lambda x, y: int(x == y)),
    (iterate_binary_arguments, "max", lambda x, y: max(x, y)),
 ):
    check(getattr(basis, binding_name), reference)
 print(f"\033[1;32m[PASSED] Task 1 finished!\033[0m")
--- a/frontend/framework/basis/test_task2.py
+++ b/frontend/framework/basis/test_task2.py
@@ -0,0 +1,55 @@
 from uctc.framework import basis
 import numpy as np
 import math

 # Float operand pairs fed to every two-argument function under test.
 binary_arguments = [
    (1.0, 2.0), (2.0, 1.0), (-1.0, 1.0), (2.0, -2.0),
    (1.0, 3.0), (3.0, -1.0), (3.0, 3.0), (-4.0, -5.0),
    (5.0, 4.0), (4.0, 4.0), (5.0, 5.0),
 ]

 # Float operands fed to every one-argument function under test.
 singular_arguments = [1.0, -3.2, 4.3, 5.5, -6.7, 4.8, 3.33, 2.22, 1.11]

 def is_close(x, y):
    """Return True when ``x`` and ``y`` differ by strictly less than 1e-5."""
    tolerance = 1e-5
    gap = abs(x - y)
    return gap < tolerance

 def sigmoid(x):
    """Reference logistic function; each branch only exponentiates a non-positive value."""
    if x >= 0:
        decay = math.exp(-x)
        return 1 / (1 + decay)
    growth = math.exp(x)
    return growth / (1 + growth)

 def iterate_binary_arguments(func, std_func):
    """Check ``func`` against ``std_func`` (within ``is_close``) on every pair in ``binary_arguments``.

    Prints a red error and terminates the script with exit status 1 on the
    first mismatch; otherwise prints a blue "Passed" line and returns True.
    """
    for argument in binary_arguments:
        # Evaluate each side once and reuse the values in the message.
        actual = func(*argument)
        expected = std_func(*argument)
        if not is_close(actual, expected):
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the previous exit(0) reported success on failure.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True

 def iterate_singular_arguments(func, std_func):
    """Check ``func`` against ``std_func`` (within ``is_close``) on every value in ``singular_arguments``.

    Prints a red error and terminates the script with exit status 1 on the
    first mismatch; otherwise prints a blue "Passed" line and returns True.
    """
    for argument in singular_arguments:
        # Evaluate each side once and reuse the values in the message.
        actual = func(argument)
        expected = std_func(argument)
        if not is_close(actual, expected):
            print(f"\033[1;31mError: {func.__name__}({argument}) = {actual} != {std_func.__name__}({argument}) = {expected}\033[0m")
            # Non-zero status: the previous exit(0) reported success on failure.
            raise SystemExit(1)
    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
    return True

 # Task 2: compare each float function binding with a pure-Python reference.
 # Bindings are looked up by name inside the loop so they resolve one at a
 # time, in the same order as the checks run.
 for check, binding_name, reference in (
    (iterate_binary_arguments, "is_close", lambda x, y: 1.0*int(is_close(x, y))),
    (iterate_singular_arguments, "sigmoid", lambda x: sigmoid(x)),
    (iterate_singular_arguments, "relu", lambda x: x if x > 0.0 else 0.0),
    (iterate_singular_arguments, "inv", lambda x: 1.0/x),
    (iterate_binary_arguments, "inv_back", lambda x, d: -d/(x*x)),
    (iterate_binary_arguments, "relu_back", lambda x, d: d * 1.0 if x > 0.0 else 0.0),
 ):
    check(getattr(basis, binding_name), reference)
 print(f"\033[1;32m[PASSED] Task 2 finished!\033[0m")
--- a/frontend/framework/basis/test_task3.py
+++ b/frontend/framework/basis/test_task3.py
@@ -0,0 +1,20 @@
 from uctc.framework import basis
 import numpy as np
 import math
 import random

 def is_close(x, y):
    """Return True when ``x`` and ``y`` differ by strictly less than 1e-5."""
    tolerance = 1e-5
    gap = abs(x - y)
    return gap < tolerance

 # Task 3: negList must negate every element of a random 128-element list.
 arr = [random.random() for i in range(128)]

 test_x = basis.negList(arr)

 test_y = [-e for e in arr]

 # zip() stops at the shorter input, so a truncated or empty result would
 # otherwise pass without a single comparison.
 if len(test_x) != len(test_y):
    print(f"\033[1;31mError: {basis.negList.__name__} returned {len(test_x)} elements, expects {len(test_y)}\033[0m")
    raise SystemExit(1)

 for i, (x, y) in enumerate(zip(test_x, test_y)):
    if not is_close(x, y):
        print(f"\033[1;31mError: {basis.negList.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
        # Non-zero status: the previous exit(0) reported success on failure.
        raise SystemExit(1)
 print(f"\033[1;34mPassed: {basis.negList.__name__} passed all tests\033[0m")
 print(f"\033[1;32m[PASSED] Task 3 finished!\033[0m")
--- a/frontend/framework/basis/test_task4_5.py
+++ b/frontend/framework/basis/test_task4_5.py
@@ -0,0 +1,21 @@
 from uctc.framework import basis
 import numpy as np
 import math
 import random

 def is_close(x, y):
    """Return True when ``x`` and ``y`` differ by strictly less than 1e-5."""
    tolerance = 1e-5
    gap = abs(x - y)
    return gap < tolerance

 # Tasks 4-5: addLists must add two random 128-element lists element by element.
 arr_a = [random.random() for i in range(128)]
 arr_b = [random.random() for i in range(128)]

 test_x = basis.addLists(arr_a, arr_b)

 test_y = [e1 + e2 for e1, e2 in zip(arr_a, arr_b)]

 # zip() stops at the shorter input, so a truncated or empty result would
 # otherwise pass without a single comparison.
 if len(test_x) != len(test_y):
    print(f"\033[1;31mError: {basis.addLists.__name__} returned {len(test_x)} elements, expects {len(test_y)}\033[0m")
    raise SystemExit(1)

 for i, (x, y) in enumerate(zip(test_x, test_y)):
    if not is_close(x, y):
        print(f"\033[1;31mError: {basis.addLists.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
        # Non-zero status: the previous exit(0) reported success on failure.
        raise SystemExit(1)
 print(f"\033[1;34mPassed: {basis.addLists.__name__} passed all tests\033[0m")
 print(f"\033[1;32m[PASSED] Task 4 finished!\033[0m")
--- a/frontend/framework/basis/test_task6.py
+++ b/frontend/framework/basis/test_task6.py
@@ -0,0 +1,30 @@
 from uctc.framework import basis
 import numpy as np
 from functools import reduce
 import random

 def is_close(x, y):
    """Return True when ``x`` and ``y`` differ by strictly less than 1e-3."""
    tolerance = 1e-3
    gap = abs(x - y)
    return gap < tolerance

 # Compare sumList / prodList with functools.reduce over a random 128-element list.
 arr = [random.random() for i in range(128)]

 test_x1 = basis.sumList(arr)

 test_x2 = basis.prodList(arr)

 test_y1 = reduce(lambda x, y: x + y, arr, 0.0)

 # NOTE(review): the product of 128 values drawn from [0, 1) is vanishingly
 # small, so with a 1e-3 tolerance a prodList that returns 0 would also pass.
 test_y2 = reduce(lambda x, y: x * y, arr, 1.0)


 if not is_close(test_x1, test_y1):
    print(f"\033[1;31mError: {basis.sumList.__name__} failed test... expects {test_y1} but gets {test_x1}\033[0m")
    # Non-zero status: the previous exit(0) reported success on failure.
    raise SystemExit(1)
 print(f"\033[1;34mPassed: {basis.sumList.__name__} passed all tests\033[0m")

 if not is_close(test_x2, test_y2):
    print(f"\033[1;31mError: {basis.prodList.__name__} failed test... expects {test_y2} but gets {test_x2}\033[0m")
    raise SystemExit(1)
 print(f"\033[1;34mPassed: {basis.prodList.__name__} passed all tests\033[0m")

 # The banner previously said "Task 3" (copy-paste); this script is test_task6.
 print(f"\033[1;32m[PASSED] Task 6 finished!\033[0m")
--- a/frontend/framework/tensor/task13_14.py
+++ b/frontend/framework/tensor/task13_14.py
@@ -0,0 +1,41 @@
 import numpy as np

 import uctc.nn as nn

 # Reference data: a random 42x48 numpy array.
 tensor1 = np.random.rand(42, 48)

 # The same values converted to a uctc Tensor.
 tensor2 = nn.pyarray_to_tensor(tensor1)

 # Transpose with numpy (reference) ...
 t_tensor1 = tensor1.transpose()

 # ... and with the Tensor implementation under test.
 t_tensor2 = tensor2.transpose()

 # Flat element list returned by the transposed Tensor.
 t_2data = t_tensor2.data()

 # Flat element list of the numpy transpose, compared position by position later.
 t_1data = t_tensor1.flatten().tolist()

 def is_close(x, y):
    """Return True when ``x`` and ``y`` differ by strictly less than 1e-5."""
    tolerance = 1e-5
    gap = abs(x - y)
    return gap < tolerance

 # Task 13: the Tensor transpose must match numpy's element by element.
 for i in range(len(t_1data)):
    if not is_close(t_1data[i], t_2data[i]):
        print(f"\033[1;31mTask 13 Error: t1 data[{i}] != t2 data[{i}]\033[0m")
        # Non-zero status: the previous exit(0) reported success on failure.
        raise SystemExit(1)

 # Task 14: argmax along axis 0.
 at2 = nn.argmax(tensor2, 0).data()
 at1 = np.argmax(tensor1, 0).flatten().tolist()

 for i in range(len(at1)):
    if not is_close(at1[i], at2[i]):
        print(f"\033[1;31mTask 14 Error: at1 data[{i}] != at2 data[{i}]\033[0m")
        raise SystemExit(1)

 # Task 14: argmax along axis 1.
 at4 = nn.argmax(tensor2, 1).data()
 at3 = np.argmax(tensor1, 1).flatten().tolist()

 # This loop used to re-compare at1/at2, so the axis-1 result was never checked.
 for i in range(len(at3)):
    if not is_close(at3[i], at4[i]):
        print(f"\033[1;31mTask 14 Error: at3 data[{i}] != at4 data[{i}]\033[0m")
        raise SystemExit(1)

 print(f"\033[1;32m[PASSED] Task 13-14 finished!\033[0m")
--- a/frontend/mnist/autofrader.py
+++ b/frontend/mnist/autofrader.py
@@ -0,0 +1,579 @@
 # A custom autograder for this project

 ################################################################################
 # A mini-framework for autograding
 ################################################################################

 import optparse
 import pickle
 import random
 import sys
 import traceback

 class WritableNull:
    """File-like sink that discards everything written to it (used to mute stdout)."""

    def write(self, string):
        """Accept ``string`` and drop it."""
        return None

    def flush(self):
        """Nothing is buffered, so there is nothing to flush."""
        return None

 class Tracker(object):
    """Bookkeeping for one autograder run: points per question, prerequisites, stdout muting."""

    def __init__(self, questions, maxes, prereqs, mute_output):
        self.questions = questions
        self.maxes = maxes
        self.prereqs = prereqs
        self.mute_output = mute_output

        # Every question starts with zero points.
        self.points = dict.fromkeys(self.questions, 0)

        # Question / test currently being graded.
        self.current_question = None
        self.current_test = None
        self.points_at_test_start = None
        self.possible_points_remaining = None

        # Stdout redirection state.
        self.original_stdout = None
        self.muted = False

    def mute(self):
        """Swap sys.stdout for a discarding sink; does nothing if already muted."""
        if not self.muted:
            self.muted = True
            self.original_stdout = sys.stdout
            sys.stdout = WritableNull()

    def unmute(self):
        """Restore the stdout saved by mute(); does nothing if not muted."""
        if self.muted:
            self.muted = False
            sys.stdout = self.original_stdout

    def begin_q(self, q):
        """Print the question header; return False if a prerequisite lacks full marks."""
        assert q in self.questions
        header = 'Question {}'.format(q)
        print('\n' + header)
        print('=' * len(header))

        # First prerequisite (in sorted order) that has not earned full marks.
        unmet = next(
            (name for name in sorted(self.prereqs[q])
             if self.points[name] < self.maxes[name]),
            None)
        if unmet is not None:
            print("""*** NOTE: Make sure to complete Question {} before working on Question {},
 *** because Question {} builds upon your answer for Question {}.
 """.format(unmet, q, q, unmet))
            return False

        self.current_question = q
        self.possible_points_remaining = self.maxes[q]
        return True

    def begin_test(self, test_name):
        """Record the starting score for a test and mute output if configured."""
        self.current_test = test_name
        self.points_at_test_start = self.points[self.current_question]
        print("*** {}) {}".format(self.current_question, self.current_test))
        if self.mute_output:
            self.mute()

    def end_test(self, pts):
        """Report PASS (all ``pts`` earned) or FAIL (none earned); partial credit prints nothing."""
        if self.mute_output:
            self.unmute()
        self.possible_points_remaining -= pts

        score_now = self.points[self.current_question]
        if score_now == self.points_at_test_start + pts:
            print("*** PASS: {}".format(self.current_test))
        elif score_now == self.points_at_test_start:
            print("*** FAIL")

        self.current_test = None
        self.points_at_test_start = None

    def end_q(self):
        """Print the score for the current question and clear the per-question state."""
        assert self.current_question is not None
        assert self.possible_points_remaining == 0
        summary = '\n### Question {}: {}/{} ###'.format(
            self.current_question,
            self.points[self.current_question],
            self.maxes[self.current_question])
        print(summary)

        self.current_question = None
        self.possible_points_remaining = None

    def finalize(self):
        """Print the finishing time and the provisional grade table."""
        import time
        print('\nFinished at %d:%02d:%02d' % time.localtime()[3:6])
        print("\nProvisional grades\n==================")

        for q in self.questions:
            print('Question %s: %d/%d' % (q, self.points[q], self.maxes[q]))
        print('------------------')
        earned_total = sum(self.points.values())
        possible_total = sum(self.maxes[q] for q in self.questions)
        print('Total: %d/%d' % (earned_total, possible_total))

        print("""
 Your grades are NOT yet registered.  To register your grades, make sure
 to follow your instructor's guidelines to receive credit on your project.
 """)

    def add_points(self, pts):
        """Credit ``pts`` to the question currently being graded."""
        self.points[self.current_question] += pts

 TESTS = []
 PREREQS = {}
 def add_prereq(q, pre):
    if isinstance(pre, str):
        pre = [pre]

    if q not in PREREQS:
        PREREQS[q] = set()
    PREREQS[q] |= set(pre)

 def test(q, points):
    def deco(fn):
        TESTS.append((q, points, fn))
        return fn
    return deco

 def parse_options(argv):
    """Parse the autograder's command-line flags from ``argv``.

    Returns the optparse values object; positional arguments are discarded.
    """
    opt_parser = optparse.OptionParser(description = 'Run public tests on student code')
    opt_parser.set_defaults(**{
        flag: False
        for flag in ('edx_output', 'gs_output', 'no_graphics',
                     'mute_output', 'check_dependencies')
    })

    def add_flag(switch, dest, help_text):
        # All boolean switches share the same store_true shape.
        opt_parser.add_option(switch, dest=dest, action='store_true', help=help_text)

    add_flag('--edx-output', 'edx_output',
             'Ignored, present for compatibility only')
    add_flag('--gradescope-output', 'gs_output',
             'Ignored, present for compatibility only')
    opt_parser.add_option('--question', '-q',
                          dest='grade_question',
                          default=None,
                          help='Grade only one question (e.g. `-q q1`)')
    add_flag('--no-graphics', 'no_graphics',
             'Do not display graphics (visualizing your implementation is highly recommended for debugging).')
    add_flag('--mute', 'mute_output',
             'Mute output from executing tests')
    add_flag('--check-dependencies', 'check_dependencies',
             'check that numpy and matplotlib are installed')

    options, _positional = opt_parser.parse_args(argv)
    return options

 def main():
    """Run the registered tests question by question and print the grade summary."""
    options = parse_options(sys.argv)
    if options.check_dependencies:
        check_dependencies()
        return

    if options.no_graphics:
        disable_graphics()

    # Collect the distinct questions and total the points available for each.
    question_names = set()
    max_points = {}
    for q, points, _fn in TESTS:
        question_names.add(q)
        max_points[q] = max_points.get(q, 0) + points
        PREREQS.setdefault(q, set())

    ordered = sorted(question_names)
    selected = options.grade_question
    if selected:
        if selected not in ordered:
            print("ERROR: question {} does not exist".format(selected))
            sys.exit(1)
        # Grading one question on request: ignore its prerequisites.
        ordered = [selected]
        PREREQS[selected] = set()

    tracker = Tracker(ordered, max_points, PREREQS, options.mute_output)
    for q in ordered:
        if not tracker.begin_q(q):
            continue

        for testq, points, fn in TESTS:
            if testq == q:
                tracker.begin_test(fn.__name__)
                try:
                    fn(tracker)
                except KeyboardInterrupt:
                    tracker.unmute()
                    print("\n\nCaught KeyboardInterrupt: aborting autograder")
                    tracker.finalize()
                    print("\n[autograder was interrupted before finishing]")
                    sys.exit(1)
                except:
                    # Anything else raised by the test is reported and grading continues.
                    tracker.unmute()
                    print(traceback.format_exc())
                tracker.end_test(points)
        tracker.end_q()
    tracker.finalize()

 ################################################################################
 # Tests begin here
 ################################################################################

 import numpy as np
 import matplotlib
 import contextlib

 import nn
 import backend

 def check_dependencies():
    """Open a matplotlib window and animate a rotating line segment for 400 frames."""
    import matplotlib.pyplot as plt
    import time
    figure, axes = plt.subplots(1, 1)
    axes.set_xlim([-1, 1])
    axes.set_ylim([-1, 1])
    (segment,) = axes.plot([], [], color="black")
    plt.show(block=False)

    for step in range(400):
        theta = step * 0.05
        px = np.sin(theta)
        py = np.cos(theta)
        # The segment runs through the origin between (px, py) and (-px, -py).
        segment.set_data([px, -px], [py, -py])
        figure.canvas.draw_idle()
        figure.canvas.start_event_loop(1e-3)

 def disable_graphics():
    """Turn graphics off for the rest of the run by clearing ``backend.use_graphics``."""
    backend.use_graphics = False

@contextlib.contextmanager
 def no_graphics():
    old_use_graphics = backend.use_graphics
    backend.use_graphics = False
    yield
    backend.use_graphics = old_use_graphics

 def verify_node(node, expected_type, expected_shape, method_name):
    """Assert that ``node`` has the expected kind and, unless it is a loss, the expected shape.

    ``expected_type`` is 'parameter', 'loss' or 'node'. ``expected_shape`` is
    compared dimension by dimension with ``node.data.shape``; a '?' entry
    matches any size. ``method_name`` only appears in the assertion messages.
    Raises AssertionError on any mismatch.
    """
    if expected_type == 'parameter':
        assert node is not None, (
            "{} should return an instance of nn.Parameter, not None".format(method_name))
        assert isinstance(node, nn.Parameter), (
            "{} should return an instance of nn.Parameter, instead got type {!r}".format(
            method_name, type(node).__name__))
    elif expected_type == 'loss':
        assert node is not None, (
            "{} should return an instance a loss node, not None".format(method_name))
        assert isinstance(node, (nn.SquareLoss, nn.SoftmaxLoss)), (
            "{} should return a loss node, instead got type {!r}".format(
            method_name, type(node).__name__))
    elif expected_type == 'node':
        assert node is not None, (
            "{} should return a node object, not None".format(method_name))
        assert isinstance(node, nn.Node), (
            "{} should return a node object, instead got type {!r}".format(
            method_name, type(node).__name__))
    else:
        assert False, "If you see this message, please report a bug in the autograder"

    if expected_type != 'loss':
        # Compare the wildcard by value. The previous `is '?'` tested object
        # identity, which depends on string interning and triggers a
        # SyntaxWarning on current Python versions.
        assert all(
            (isinstance(expected, str) and expected == '?') or actual == expected
            for (actual, expected) in zip(node.data.shape, expected_shape)), (
            "{} should return an object with shape {}, got {}".format(
                method_name, nn.format_shape(expected_shape), nn.format_shape(node.data.shape)))

 def trace_node(node_to_trace):
    """
    Returns a set containing the node and all ancestors in the computation graph

    Ancestors are found by following each node's ``parents`` attribute. The
    walk uses an explicit stack instead of recursion, so very deep graphs do
    not hit Python's recursion limit.
    """
    nodes = set()
    pending = [node_to_trace]

    while pending:
        node = pending.pop()
        if node in nodes:
            continue
        nodes.add(node)
        pending.extend(node.parents)

    return nodes

@test('q1', points=6)
 def check_perceptron(tracker):
    import models

    print("Sanity checking perceptron...")
    np_random = np.random.RandomState(0)
    # Check that the perceptron weights are initialized to a vector with `dimensions` entries.
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        p_weights = p.get_weights()
        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")

    # Check that run returns a node, and that the score in the node is correct
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        p_weights = p.get_weights()
        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")
        point = np_random.uniform(-10, 10, (1, dimensions))
        score = p.run(nn.Constant(point))
        verify_node(score, 'node', (1, 1), "PerceptronModel.run()")
        calculated_score = nn.as_scalar(score)
        expected_score = float(np.dot(point.flatten(), p_weights.data.flatten()))
        assert np.isclose(calculated_score, expected_score), (
            "The score computed by PerceptronModel.run() ({:.4f}) does not match the expected score ({:.4f})".format(
            calculated_score, expected_score))

    # Check that get_prediction returns the correct values, including the
    # case when a point lies exactly on the decision boundary
    for dimensions in range(1, 10):
        p = models.PerceptronModel(dimensions)
        random_point = np_random.uniform(-10, 10, (1, dimensions))
        for point in (random_point, np.zeros_like(random_point)):
            prediction = p.get_prediction(nn.Constant(point))
            assert prediction == 1 or prediction == -1, (
                "PerceptronModel.get_prediction() should return 1 or -1, not {}".format(
                prediction))

            expected_prediction = np.asscalar(np.where(np.dot(point, p.get_weights().data.T) >= 0, 1, -1))
            assert prediction == expected_prediction, (
                "PerceptronModel.get_prediction() returned {}; expected {}".format(
                    prediction, expected_prediction))

    tracker.add_points(2) # Partial credit for passing sanity checks

    print("Sanity checking perceptron weight updates...")

    # Test weight updates. This involves constructing a dataset that
    # requires 0 or 1 updates before convergence, and testing that weight
    # values change as expected. Note that (multiplier < -1 or multiplier > 1)
    # must be true for the testing code to be correct.
    dimensions = 2
    for multiplier in (-5, -2, 2, 5):
        p = models.PerceptronModel(dimensions)
        orig_weights = p.get_weights().data.reshape((1, dimensions)).copy()
        if np.abs(orig_weights).sum() == 0.0:
            # This autograder test doesn't work when weights are exactly zero
            continue
        point = multiplier * orig_weights
        sanity_dataset = backend.Dataset(
            x=np.tile(point, (500, 1)),
            y=np.ones((500, 1)) * -1.0
        )
        p.train(sanity_dataset)
        new_weights = p.get_weights().data.reshape((1, dimensions))

        if multiplier < 0:
            expected_weights = orig_weights
        else:
            expected_weights = orig_weights - point

        if not np.all(new_weights == expected_weights):
            print()
            print("Initial perceptron weights were: [{:.4f}, {:.4f}]".format(
                orig_weights[0,0], orig_weights[0,1]))
            print("All data points in the dataset were identical and had:")
            print("    x = [{:.4f}, {:.4f}]".format(
                point[0,0], point[0,1]))
            print("    y = -1")
            print("Your trained weights were: [{:.4f}, {:.4f}]".format(
                new_weights[0,0], new_weights[0,1]))
            print("Expected weights after training: [{:.4f}, {:.4f}]".format(
                expected_weights[0,0], expected_weights[0,1]))
            print()
            assert False, "Weight update sanity check failed"

    print("Sanity checking complete. Now training perceptron")
    model = models.PerceptronModel(3)
    dataset = backend.PerceptronDataset(model)

    model.train(dataset)
    backend.maybe_sleep_and_close(1)

    assert dataset.epoch != 0, "Perceptron code never iterated over the training data"

    accuracy = np.mean(np.where(np.dot(dataset.x, model.get_weights().data.T) >= 0.0, 1.0, -1.0) == dataset.y)
    if accuracy < 1.0:
        print("The weights learned by your perceptron correctly classified {:.2%} of training examples".format(accuracy))
        print("To receive full points for this question, your perceptron must converge to 100% accuracy")
        return

    tracker.add_points(4)

@test('q2', points=6)
 def check_regression(tracker):
    import models
    model = models.RegressionModel()
    dataset = backend.RegressionDataset(model)

    detected_parameters = None
    for batch_size in (1, 2, 4):
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        output_node = model.run(inp_x)
        verify_node(output_node, 'node', (batch_size, 1), "RegressionModel.run()")
        trace = trace_node(output_node)
        assert inp_x in trace, "Node returned from RegressionModel.run() does not depend on the provided input (x)"

        if detected_parameters is None:
            detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "Calling RegressionModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    for batch_size in (1, 2, 4):
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        loss_node = model.get_loss(inp_x, inp_y)
        verify_node(loss_node, 'loss', None, "RegressionModel.get_loss()")
        trace = trace_node(loss_node)
        assert inp_x in trace, "Node returned from RegressionModel.get_loss() does not depend on the provided input (x)"
        assert inp_y in trace, "Node returned from RegressionModel.get_loss() does not depend on the provided labels (y)"

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "RegressionModel.get_loss() should not use additional parameters not used by RegressionModel.run()")

    tracker.add_points(2) # Partial credit for passing sanity checks

    model.train(dataset)
    backend.maybe_sleep_and_close(1)

    train_loss = model.get_loss(nn.Constant(dataset.x), nn.Constant(dataset.y))
    verify_node(train_loss, 'loss', None, "RegressionModel.get_loss()")
    train_loss = nn.as_scalar(train_loss)

    # Re-compute the loss ourselves: otherwise get_loss() could be hard-coded
    # to always return zero
    train_predicted = model.run(nn.Constant(dataset.x))
    verify_node(train_predicted, 'node', (dataset.x.shape[0], 1), "RegressionModel.run()")
    sanity_loss = 0.5 * np.mean((train_predicted.data - dataset.y)**2)

    assert np.isclose(train_loss, sanity_loss), (
        "RegressionModel.get_loss() returned a loss of {:.4f}, "
        "but the autograder computed a loss of {:.4f} "
        "based on the output of RegressionModel.run()".format(
            train_loss, sanity_loss))

    loss_threshold = 0.02
    if train_loss <= loss_threshold:
        print("Your final loss is: {:f}".format(train_loss))
        tracker.add_points(4)
    else:
        print("Your final loss ({:f}) must be no more than {:.4f} to receive full points for this question".format(train_loss, loss_threshold))

@test('q3', points=6)
 def check_digit_classification(tracker):
    import models
    model = models.DigitClassificationModel()
    dataset = backend.DigitClassificationDataset(model)

    detected_parameters = None
    for batch_size in (1, 2, 4):
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        output_node = model.run(inp_x)
        verify_node(output_node, 'node', (batch_size, 10), "DigitClassificationModel.run()")
        trace = trace_node(output_node)
        assert inp_x in trace, "Node returned from DigitClassificationModel.run() does not depend on the provided input (x)"

        if detected_parameters is None:
            detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "Calling DigitClassificationModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    for batch_size in (1, 2, 4):
        inp_x = nn.Constant(dataset.x[:batch_size])
        inp_y = nn.Constant(dataset.y[:batch_size])
        loss_node = model.get_loss(inp_x, inp_y)
        verify_node(loss_node, 'loss', None, "DigitClassificationModel.get_loss()")
        trace = trace_node(loss_node)
        assert inp_x in trace, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided input (x)"
        assert inp_y in trace, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided labels (y)"

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "DigitClassificationModel.get_loss() should not use additional parameters not used by DigitClassificationModel.run()")

    tracker.add_points(2) # Partial credit for passing sanity checks

    model.train(dataset)

    test_logits = model.run(nn.Constant(dataset.test_images)).data
    test_predicted = np.argmax(test_logits, axis=1)
    test_accuracy = np.mean(test_predicted == dataset.test_labels)

    accuracy_threshold = 0.97
    if test_accuracy >= accuracy_threshold:
        print("Your final test set accuracy is: {:%}".format(test_accuracy))
        tracker.add_points(4)
    else:
        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(test_accuracy, accuracy_threshold))

@test('q4', points=7)
 def check_lang_id(tracker):
    import models
    model = models.LanguageIDModel()
    dataset = backend.LanguageIDDataset(model)

    detected_parameters = None
    for batch_size, word_length in ((1, 1), (2, 1), (2, 6), (4, 8)):
        start = dataset.dev_buckets[-1, 0]
        end = start + batch_size
        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
        inp_xs = inp_xs[:word_length]

        output_node = model.run(inp_xs)
        verify_node(output_node, 'node', (batch_size, len(dataset.language_names)), "LanguageIDModel.run()")
        trace = trace_node(output_node)
        for inp_x in inp_xs:
            assert inp_x in trace, "Node returned from LanguageIDModel.run() does not depend on all of the provided inputs (xs)"

        # Word length 1 does not use parameters related to transferring the
        # hidden state across timesteps, so initial parameter detection is only
        # run for longer words
        if word_length > 1:
            if detected_parameters is None:
                detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]

            for node in trace:
                assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                    "Calling LanguageIDModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")

    for batch_size, word_length in ((1, 1), (2, 1), (2, 6), (4, 8)):
        start = dataset.dev_buckets[-1, 0]
        end = start + batch_size
        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
        inp_xs = inp_xs[:word_length]
        loss_node = model.get_loss(inp_xs, inp_y)
        trace = trace_node(loss_node)
        for inp_x in inp_xs:
            assert inp_x in trace, "Node returned from LanguageIDModel.run() does not depend on all of the provided inputs (xs)"
        assert inp_y in trace, "Node returned from LanguageIDModel.get_loss() does not depend on the provided labels (y)"

        for node in trace:
            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
                "LanguageIDModel.get_loss() should not use additional parameters not used by LanguageIDModel.run()")

    tracker.add_points(2) # Partial credit for passing sanity checks

    model.train(dataset)

    test_predicted_probs, test_predicted, test_correct = dataset._predict('test')
    test_accuracy = np.mean(test_predicted == test_correct)
    accuracy_threshold = 0.81
    if test_accuracy >= accuracy_threshold:
        print("Your final test set accuracy is: {:%}".format(test_accuracy))
        tracker.add_points(5)
    else:
        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(test_accuracy, accuracy_threshold))

 # Run the autograder entry point only when this file is executed as a script.
 if __name__ == '__main__':
    main()
--- a/frontend/mnist/backend.py
+++ b/frontend/mnist/backend.py
@@ -0,0 +1,449 @@
 import collections
 import os
 import time
 import os

 import matplotlib.pyplot as plt
 import numpy as np

 import nn

 # Module-wide switch: when True the datasets below open matplotlib figures
 # (or, for LanguageIDDataset, print progress) while training runs.
 use_graphics = True

 def maybe_sleep_and_close(seconds):
    """
    Pause briefly, then close every open matplotlib figure.

    Does nothing when graphics are disabled or no figure is open. Otherwise
    sleeps for `seconds`, closes each figure, and gives the GUI event loop a
    moment to process the close.
    """
    if use_graphics and plt.get_fignums():
        time.sleep(seconds)
        for fignum in plt.get_fignums():
            fig = plt.figure(fignum)
            plt.close(fig)
            try:
                # This raises a TclError on some Windows machines
                fig.canvas.start_event_loop(1e-3)
            except Exception:
                # Best effort only: ignore GUI backend errors, but let
                # KeyboardInterrupt / SystemExit propagate (a bare `except`
                # used to swallow those as well).
                pass

 def get_data_path(filename):
    """
    Locate a data file relative to this module.

    Looks, in order, in ``<module dir>/../data``, ``<module dir>/data`` and
    ``<module dir>`` and returns the first path that exists.

    Raises:
        Exception: if the file is found in none of those places.
    """
    module_dir = os.path.dirname(__file__)
    candidates = (
        os.path.join(module_dir, os.pardir, "data", filename),
        os.path.join(module_dir, "data", filename),
        os.path.join(module_dir, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Could not find data file: {}".format(filename))

 class Dataset(object):
    """
    Base dataset: a pair of 2-D floating-point arrays with matching row
    counts, served to models as batches of nn.Constant nodes.
    """
    def __init__(self, x, y):
        """Validate and store the feature array `x` and target array `y`."""
        for array in (x, y):
            assert isinstance(array, np.ndarray)
        for array in (x, y):
            assert np.issubdtype(array.dtype, np.floating)
        for array in (x, y):
            assert array.ndim == 2
        assert x.shape[0] == y.shape[0]
        self.x = x
        self.y = y

    def iterate_once(self, batch_size):
        """
        Yield (x, y) nn.Constant pairs covering the dataset once, in order.

        `batch_size` must be a positive int that divides the dataset size.
        """
        assert isinstance(batch_size, int) and batch_size > 0, (
            "Batch size should be a positive integer, got {!r}".format(
                batch_size))
        total = self.x.shape[0]
        assert total % batch_size == 0, (
            "Dataset size {:d} is not divisible by batch size {:d}".format(
                total, batch_size))
        for start in range(0, total, batch_size):
            stop = start + batch_size
            batch_x = self.x[start:stop]
            batch_y = self.y[start:stop]
            yield nn.Constant(batch_x), nn.Constant(batch_y)

    def iterate_forever(self, batch_size):
        """Yield batches endlessly by repeating iterate_once()."""
        while True:
            yield from self.iterate_once(batch_size)

    def get_validation_accuracy(self):
        """Not available on the base class; always raises NotImplementedError."""
        raise NotImplementedError(
            "No validation data is available for this dataset. "
            "In this assignment, only the Digit Classification and Language "
            "Identification datasets have validation data.")

 class PerceptronDataset(Dataset):
    """
    Synthetic two-class dataset for the perceptron, with an optional live plot.

    Each of the 500 rows holds two standard-normal features followed by a
    constant 1. The label is 1.0 where x0 + 2 * x1 - 1 >= 0 and -1.0
    otherwise.
    """
    def __init__(self, model):
        """Generate the data and, if `use_graphics` is set, open the figure."""
        points = 500
        # Two random features plus a trailing column of ones.
        x = np.hstack([np.random.randn(points, 2), np.ones((points, 1))])
        y = np.where(x[:, 0] + 2 * x[:, 1] - 1 >= 0, 1.0, -1.0)
        # Dataset requires 2-D targets, so labels become a (points x 1) column.
        super().__init__(x, np.expand_dims(y, axis=1))

        self.model = model
        self.epoch = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            limits = np.array([-3.0, 3.0])
            ax.set_xlim(limits)
            ax.set_ylim(limits)
            # Scatter only the two real features (the ones column is dropped).
            positive = ax.scatter(*x[y == 1, :-1].T, color="red", marker="+")
            negative = ax.scatter(*x[y == -1, :-1].T, color="blue", marker="_")
            # Empty line artist, filled in later with the current boundary.
            line, = ax.plot([], [], color="black")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([positive, negative], [1, -1])
            plt.show(block=False)

            self.fig = fig
            self.limits = limits
            self.line = line
            self.text = text
            self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yield batches like Dataset.iterate_once, counting epochs and
        refreshing the plot of the model's decision boundary as it goes.
        """
        self.epoch += 1

        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            # Redraw at most once every 0.01 seconds.
            if use_graphics and time.time() - self.last_update > 0.01:
                w = self.model.get_weights().data.flatten()
                limits = self.limits
                # Boundary is w[0]*x + w[1]*y + w[2] = 0; solve for y when
                # possible, fall back to a vertical line, else draw nothing.
                if w[1] != 0:
                    self.line.set_data(limits, (-w[0] * limits - w[2]) / w[1])
                elif w[0] != 0:
                    self.line.set_data(np.full(2, -w[2] / w[0]), limits)
                else:
                    self.line.set_data([], [])
                self.text.set_text(
                    "epoch: {:,}\npoint: {:,}/{:,}\nweights: {}".format(
                        self.epoch, i * batch_size + 1, len(self.x), w))
                self.fig.canvas.draw_idle()
                self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

 class RegressionDataset(Dataset):
    """
    Regression dataset of y = sin(x) sampled at 200 evenly spaced points on
    [-2*pi, 2*pi], with an optional live plot of the model's predictions.
    """
    def __init__(self, model):
        """Build the data and, if `use_graphics` is set, open the figure."""
        x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        # Shuffle with a fixed seed so the example order is reproducible.
        np.random.RandomState(0).shuffle(x)
        # Remember the order that sorts x, used to draw curves left to right.
        self.argsort_x = np.argsort(x.flatten())
        y = np.sin(x)
        super().__init__(x, y)

        self.model = model
        self.processed = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(-2 * np.pi, 2 * np.pi)
            ax.set_ylim(-1.4, 1.4)
            real, = ax.plot(x[self.argsort_x], y[self.argsort_x], color="blue")
            # Empty line artist, filled in later with the model's predictions.
            learned, = ax.plot([], [], color="red")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([real, learned], ["real", "learned"])
            plt.show(block=False)

            self.fig = fig
            self.learned = learned
            self.text = text
            self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yield batches like Dataset.iterate_once, counting processed examples
        and periodically redrawing the learned curve and the current loss.
        """
        for x, y in super().iterate_once(batch_size):
            yield x, y
            self.processed += batch_size

            # Redraw at most once every 0.1 seconds.
            if use_graphics and time.time() - self.last_update > 0.1:
                # Evaluate the model on the whole dataset for the plot.
                predicted = self.model.run(nn.Constant(self.x)).data
                loss = self.model.get_loss(
                    nn.Constant(self.x), nn.Constant(self.y)).data
                self.learned.set_data(self.x[self.argsort_x], predicted[self.argsort_x])
                self.text.set_text("processed: {:,}\nloss: {:.6f}".format(
                   self.processed, loss))
                self.fig.canvas.draw_idle()
                self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

 class DigitClassificationDataset(Dataset):
    """
    MNIST digit dataset loaded from "mnist.npz".

    The 60000 training images are served as batches with one-hot labels. The
    10000 test images are split by alternating index into a dev set (even
    indices) and a test set (odd indices). When `use_graphics` is set, a
    figure shows sample dev images placed by predicted probability of the
    correct label.
    """
    def __init__(self, model):
        """Load the data and, if `use_graphics` is set, build the figure."""
        mnist_path = get_data_path("mnist.npz")

        with np.load(mnist_path) as data:
            train_images = data["train_images"]
            train_labels = data["train_labels"]
            test_images = data["test_images"]
            test_labels = data["test_labels"]
            assert len(train_images) == len(train_labels) == 60000
            assert len(test_images) == len(test_labels) == 10000
            # Even-indexed test examples form the dev split, odd-indexed ones
            # the held-out test split.
            self.dev_images = test_images[0::2]
            self.dev_labels = test_labels[0::2]
            self.test_images = test_images[1::2]
            self.test_labels = test_labels[1::2]

        # One-hot encode the training labels into a (60000 x 10) array.
        train_labels_one_hot = np.zeros((len(train_images), 10))
        train_labels_one_hot[range(len(train_images)), train_labels] = 1

        super().__init__(train_images, train_labels_one_hot)

        self.model = model
        self.epoch = 0

        if use_graphics:
            width = 20  # Width of each row expressed as a multiple of image width
            samples = 100  # Number of images to display per label
            fig = plt.figure()
            ax = {}
            images = collections.defaultdict(list)
            texts = collections.defaultdict(list)
            # Build one row per digit. Iterating from 9 downwards creates
            # ax[9] first, so the other rows can share its x axis.
            for i in reversed(range(10)):
                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
                                         sharex=ax.get(9))
                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
                ax[i].set_yticks([])
                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
                           va="center")
                ax[i].set_xlim(0, 28 * width)
                ax[i].set_ylim(0, 28)
                # Pre-create blank image and text artists; iterate_once()
                # later updates them in place.
                for j in range(samples):
                    images[i].append(ax[i].imshow(
                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
                        alpha=0.3))
                    texts[i].append(ax[i].text(
                        0, 0, "", ha="center", va="top", fontsize="smaller"))
            # Label the shared x axis as a probability from 0.0 to 1.0.
            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
            ax[9].set_xticklabels(
                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
            ax[9].tick_params(axis="x", pad=16)
            ax[9].set_xlabel("Probability of Correct Label")
            status = ax[0].text(
                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
                va="bottom")
            plt.show(block=False)

            self.width = width
            self.samples = samples
            self.fig = fig
            self.images = images
            self.texts = texts
            self.status = status
            self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yield training batches like Dataset.iterate_once, counting epochs and
        refreshing the dev-set visualisation at most once per second.
        """
        self.epoch += 1

        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            if use_graphics and time.time() - self.last_update > 1:
                dev_logits = self.model.run(nn.Constant(self.dev_images)).data
                dev_predicted = np.argmax(dev_logits, axis=1)
                # Presumably log_softmax returns log-probabilities, so exp()
                # gives per-class probabilities -- confirm against nn.py.
                dev_probs = np.exp(nn.SoftmaxLoss.log_softmax(dev_logits))
                dev_accuracy = np.mean(dev_predicted == self.dev_labels)

                self.status.set_text(
                    "epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
                    "{:.2%}".format(
                        self.epoch, i, len(self.x) // batch_size, dev_accuracy))
                # NOTE: this loop reuses the name `i`; the batch index was
                # already consumed by the status text above and enumerate()
                # rebinds it on the next batch.
                for i in range(10):
                    predicted = dev_predicted[self.dev_labels == i]
                    probs = dev_probs[self.dev_labels == i][:, i]
                    # Pick `samples` examples evenly spread across the
                    # ranking by probability of the correct label.
                    linspace = np.linspace(
                        0, len(probs) - 1, self.samples).astype(int)
                    indices = probs.argsort()[linspace]
                    for j, (prob, image) in enumerate(zip(
                            probs[indices],
                            self.dev_images[self.dev_labels == i][indices])):
                        self.images[i][j].set_data(image.reshape((28, 28)))
                        # Horizontal position is proportional to probability.
                        left = prob * (self.width - 1) * 28
                        if predicted[indices[j]] == i:
                            self.images[i][j].set_cmap("Greens")
                            self.texts[i][j].set_text("")
                        else:
                            # Misclassified: draw in red and show the wrong
                            # predicted digit.
                            self.images[i][j].set_cmap("Reds")
                            self.texts[i][j].set_text(predicted[indices[j]])
                            self.texts[i][j].set_x(left + 14)
                        self.images[i][j].set_extent([left, left + 28, 0, 28])
                self.fig.canvas.draw_idle()
                self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

    def get_validation_accuracy(self):
        """Return the fraction of dev images whose argmax score matches the label."""
        # print(self.dev_images[:2].tolist())
        dev_logits = self.model.run(nn.Constant(self.dev_images)).data
        # print(f"dev logits: {dev_logits.flatten()[10:20]}")
        dev_predicted = np.argmax(dev_logits, axis=1)
        dev_accuracy = np.mean(dev_predicted == self.dev_labels)
        return dev_accuracy

 class LanguageIDDataset(Dataset):
    """
    Word-level language identification dataset loaded from "lang_id.npz".

    Unlike the other datasets this class does not call Dataset.__init__ (so
    self.x / self.y are never set); it keeps separate train/dev/test arrays
    together with "bucket" index ranges and overrides iterate_once().
    """
    def __init__(self, model):
        """Load the arrays, print the alphabet and prepare progress templates."""
        self.model = model

        data_path = get_data_path("lang_id.npz")

        with np.load(data_path) as data:
            self.chars = data['chars']
            self.language_codes = data['language_codes']
            self.language_names = data['language_names']

            self.train_x = data['train_x']
            self.train_y = data['train_y']
            self.train_buckets = data['train_buckets']
            self.dev_x = data['dev_x']
            self.dev_y = data['dev_y']
            self.dev_buckets = data['dev_buckets']
            self.test_x = data['test_x']
            self.test_y = data['test_y']
            self.test_buckets = data['test_buckets']

        self.epoch = 0
        # Each bucket row is used as a (start, end) index pair; sampling
        # weights are proportional to the number of examples in the bucket.
        self.bucket_weights = self.train_buckets[:,1] - self.train_buckets[:,0]
        self.bucket_weights = self.bucket_weights / float(self.bucket_weights.sum())

        self.chars_print = self.chars
        try:
            print(u"Alphabet: {}".format(u"".join(self.chars)))
        except UnicodeEncodeError:
            # Fall back to an ASCII-only alphabet for terminals that cannot
            # print the real characters.
            self.chars_print = "abcdefghijklmnopqrstuvwxyzaaeeeeiinoouuacelnszz"
            print("Alphabet: " + self.chars_print)
            self.chars_print = list(self.chars_print)
            print("""
 NOTE: Your terminal does not appear to support printing Unicode characters.
 For the purposes of printing to the terminal, some of the letters in the
 alphabet above have been substituted with ASCII symbols.""".strip())
        print("")

        # Select some examples to spotlight in the monitoring phase (3 per language)
        spotlight_idxs = []
        for i in range(len(self.language_names)):
            idxs_lang_i = np.nonzero(self.dev_y == i)[0]
            idxs_lang_i = np.random.choice(idxs_lang_i, size=3, replace=False)
            spotlight_idxs.extend(list(idxs_lang_i))
        self.spotlight_idxs = np.array(spotlight_idxs, dtype=int)

        # Templates for printing updates as training progresses
        max_word_len = self.dev_x.shape[1]
        max_lang_len = max([len(x) for x in self.language_names])

        # 'NUM' is a placeholder replaced with the column width before the
        # template is used with str.format().
        self.predicted_template = u"Pred: {:<NUM}".replace('NUM',
            str(max_lang_len))

        self.word_template = u"  "
        self.word_template += u"{:<NUM} ".replace('NUM', str(max_word_len))
        self.word_template += u"{:<NUM} ({:6.1%})".replace('NUM', str(max_lang_len))
        self.word_template += u" {:<NUM} ".replace('NUM',
            str(max_lang_len + len('Pred: ')))
        # Append one "|code probability" column per language.
        for i in range(len(self.language_names)):
            self.word_template += u"|{}".format(self.language_codes[i])
            self.word_template += "{probs[" + str(i) + "]:4.0%}"

        self.last_update = time.time()

    def _encode(self, inp_x, inp_y):
        """
        Convert integer-coded words and labels into one-hot nn.Constant nodes.

        Returns (xs, y): `xs` is a list with one node per character position,
        `y` is a single node of one-hot language labels.
        """
        xs = []
        for i in range(inp_x.shape[1]):
            # A column consisting entirely of -1 ends the words in this batch
            # (-1 appears to be the padding value -- confirm against the data).
            if np.all(inp_x[:,i] == -1):
                break
            assert not np.any(inp_x[:,i] == -1), (
                "Please report this error in the project: batching by length was done incorrectly in the provided code")
            # Row lookup into an identity matrix yields one-hot vectors.
            x = np.eye(len(self.chars))[inp_x[:,i]]
            xs.append(nn.Constant(x))
        y = np.eye(len(self.language_names))[inp_y]
        y = nn.Constant(y)
        return xs, y

    def _softmax(self, x):
        """Softmax over the last axis, shifted by the row maximum before exp()."""
        exp = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp / np.sum(exp, axis=-1, keepdims=True)

    def _predict(self, split='dev'):
        """
        Run the model over every bucket of a split.

        `split` selects the dev data when equal to 'dev' and the test data
        for any other value. Returns (probabilities, predicted labels,
        correct labels) as arrays.
        """
        if split == 'dev':
            data_x = self.dev_x
            data_y = self.dev_y
            buckets = self.dev_buckets
        else:
            data_x = self.test_x
            data_y = self.test_y
            buckets = self.test_buckets

        all_predicted = []
        all_correct = []
        for bucket_id in range(buckets.shape[0]):
            start, end = buckets[bucket_id]
            xs, y = self._encode(data_x[start:end], data_y[start:end])
            predicted = self.model.run(xs)

            all_predicted.extend(list(predicted.data))
            all_correct.extend(list(data_y[start:end]))

        all_predicted_probs = self._softmax(np.asarray(all_predicted))
        all_predicted = np.asarray(all_predicted).argmax(axis=-1)
        all_correct = np.asarray(all_correct)

        return all_predicted_probs, all_predicted, all_correct

    def iterate_once(self, batch_size):
        """
        Yield train_size // batch_size randomly sampled training batches.

        Each batch is drawn from a single bucket, so this is random sampling
        rather than an ordered pass over the data. While `use_graphics` is
        set, dev accuracy and spotlight examples are printed at most every
        0.5 seconds.
        """
        assert isinstance(batch_size, int) and batch_size > 0, (
            "Batch size should be a positive integer, got {!r}".format(
                batch_size))
        assert self.train_x.shape[0] >= batch_size, (
            "Dataset size {:d} is smaller than the batch size {:d}".format(
                self.train_x.shape[0], batch_size))

        self.epoch += 1

        for iteration in range(self.train_x.shape[0] // batch_size):
            # Pick a bucket in proportion to its size, then sample example
            # offsets inside it (np.random.choice's default, i.e. with
            # replacement).
            bucket_id = np.random.choice(self.bucket_weights.shape[0], p=self.bucket_weights)
            example_ids = self.train_buckets[bucket_id, 0] + np.random.choice(
                self.train_buckets[bucket_id, 1] - self.train_buckets[bucket_id, 0],
                size=batch_size)

            yield self._encode(self.train_x[example_ids], self.train_y[example_ids])

            if use_graphics and time.time() - self.last_update > 0.5:
                dev_predicted_probs, dev_predicted, dev_correct = self._predict()
                dev_accuracy = np.mean(dev_predicted == dev_correct)

                print("epoch {:,} iteration {:,} validation-accuracy {:.1%}".format(
                    self.epoch, iteration, dev_accuracy))

                # Print one line per spotlight word: the word, its true
                # language and probability, any wrong prediction, and the
                # per-language probabilities.
                for idx in self.spotlight_idxs:
                    correct = (dev_predicted[idx] == dev_correct[idx])
                    word = u"".join([self.chars_print[ch] for ch in self.dev_x[idx] if ch != -1])

                    print(self.word_template.format(
                        word,
                        self.language_names[dev_correct[idx]],
                        dev_predicted_probs[idx, dev_correct[idx]],
                        "" if correct else self.predicted_template.format(
                            self.language_names[dev_predicted[idx]]),
                        probs=dev_predicted_probs[idx,:],
                    ))

                self.last_update = time.time()

    def get_validation_accuracy(self):
        """Return the fraction of dev examples the model classifies correctly."""
        dev_predicted_probs, dev_predicted, dev_correct = self._predict()
        dev_accuracy = np.mean(dev_predicted == dev_correct)
        return dev_accuracy


 def main():
    """
    Script entry point: train the digit classification model on its dataset.

    The other model/dataset pairs are kept below as commented-out
    alternatives.
    """
    import models
    # model = models.PerceptronModel(3)
    # dataset = PerceptronDataset(model)
    # model.train(dataset)

    # model = models.RegressionModel()
    # dataset = RegressionDataset(model)
    # model.train(dataset)

    classifier = models.DigitClassificationModel()
    digits = DigitClassificationDataset(classifier)
    classifier.train(digits)

    # model = models.LanguageIDModel()
    # dataset = LanguageIDDataset(model)
    # model.train(dataset)

 # Run the demo training only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/frontend/mnist/data/lang_id.npz
+++ b/frontend/mnist/data/lang_id.npz
--- a/frontend/mnist/data/mnist.npz
+++ b/frontend/mnist/data/mnist.npz
--- a/frontend/mnist/models.py
+++ b/frontend/mnist/models.py
@@ -0,0 +1,292 @@
 import nn

 class PerceptronModel(object):
    """Binary perceptron classifier with a single (1 x dimensions) weight row."""
    def __init__(self, dimensions):
        """
        Initialize a new Perceptron instance.

        A perceptron classifies data points as either belonging to a particular
        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
        For example, dimensions=2 would mean that the perceptron must classify
        2D points.
        """
        self.w = nn.Parameter(1, dimensions)

    def get_weights(self):
        """
        Return a Parameter instance with the current weights of the perceptron.
        """
        return self.w

    def run(self, x):
        """
        Calculates the score assigned by the perceptron to a data point x.

        Inputs:
            x: a node with shape (1 x dimensions)
        Returns: a node containing a single number (the score)
        """
        return nn.DotProduct(x, self.get_weights())

    def get_prediction(self, x):
        """
        Calculates the predicted class for a single data point `x`.

        Returns: 1 or -1
        """
        score = nn.as_scalar(self.run(x))
        return 1 if score >= 0 else -1

    def train(self, dataset):
        """
        Train the perceptron until convergence.

        Repeats passes over the dataset one example at a time; whenever an
        example is misclassified the weights are moved by `label * x`.
        Training stops after the first pass with no mistakes.
        """
        batch_size = 1

        while True:
            converged = True
            for x, y in dataset.iterate_once(batch_size):
                label = nn.as_scalar(y)
                # The leftover debugging `print(x, y)` / `assert 0` that used
                # to abort training on the very first example is removed.
                if self.get_prediction(x) != label:
                    converged = False
                    self.w.update(x, label)
            if converged:
                break


 class RegressionModel(object):
    """
    A neural network model for approximating a function that maps from real
    numbers to real numbers. The network should be sufficiently large to be able
    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.
    """
    def __init__(self):
        """Set the hyper-parameters and create the two layers' parameters."""
        # Layer sizes: 1 input feature, 50 hidden units, 1 output value.
        self.i = 1
        self.o = 1

        self.h = 50
        # Mini-batch size and gradient step size used by train().
        self.b = 10
        self.learning_rate = 0.01

        self.W1 = nn.Parameter(self.i, self.h)
        self.b1 = nn.Parameter(1, self.h)
        self.W2 = nn.Parameter(self.h, self.o)
        self.b2 = nn.Parameter(1, self.o)

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
        Returns:
            A node with shape (batch_size x 1) containing predicted y-values
        """
        # Hidden layer: ReLU(x * W1 + b1).
        hidden_linear = nn.Linear(x, self.W1)
        hidden = nn.ReLU(nn.AddBias(hidden_linear, self.b1))
        # Output layer: hidden * W2 + b2, with no activation.
        output_linear = nn.Linear(hidden, self.W2)
        return nn.AddBias(output_linear, self.b2)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
            y: a node with shape (batch_size x 1), containing the true y-values
                to be used for training
        Returns: a loss node
        """
        predicted = self.run(x)
        return nn.SquareLoss(predicted, y)

    def train(self, dataset):
        """
        Trains the model.

        Runs at most 20 passes of mini-batch gradient descent, printing every
        batch loss, and stops early once the loss of the last batch of a pass
        drops below 0.01.
        """
        parameters = [self.W1, self.b1, self.W2, self.b2]
        step = -self.learning_rate
        for _epoch in range(20):
            for x, y in dataset.iterate_once(self.b):
                loss = self.get_loss(x, y)
                print(loss.data)
                gradients = nn.gradients(loss, parameters)
                # print(g_W1.data)
                # print(g_b1.data)
                # print(g_W2.data)
                # print(g_b2.data)
                for parameter, gradient in zip(parameters, gradients):
                    parameter.update(gradient, step)
            if loss.data < 0.01:
                break
        

 class DigitClassificationModel(object):
    """
    A model for handwritten digit classification using the MNIST dataset.

    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
    into a 784-dimensional vector for the purposes of this model. Each entry in
    the vector is a floating point number between 0 and 1.

    The goal is to sort each digit into one of 10 classes (number 0 through 9).

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)
    """
    def __init__(self):
        """Set the hyper-parameters and create the three layers' parameters."""
        # Layer sizes: 784 inputs -> 200 -> 100 -> 10 class scores.
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10
        # Gradient step size and mini-batch size used by train().
        self.lr = 0.01
        self.batch_size = 100
        # Validation accuracy at which train() stops. It is kept above the
        # 97% test accuracy the autograder requires for full credit, to
        # leave a margin between the dev and test splits.
        self.target_accuracy = 0.975
        self.w1 = nn.Parameter(self.input_features, self.h1)
        self.b1 = nn.Parameter(1, self.h1)
        self.w2 = nn.Parameter(self.h1, self.h2)
        self.b2 = nn.Parameter(1, self.h2)
        self.w3 = nn.Parameter(self.h2, self.output_features)
        self.b3 = nn.Parameter(1, self.output_features)

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Your model should predict a node with shape (batch_size x 10),
        containing scores. Higher scores correspond to greater probability of
        the image belonging to a particular class.

        Inputs:
            x: a node with shape (batch_size x 784)
        Output:
            A node with shape (batch_size x 10) containing predicted scores
                (also called logits)
        """
        # Two ReLU hidden layers followed by a linear output layer.
        l1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        l2 = nn.ReLU(nn.AddBias(nn.Linear(l1, self.w2), self.b2))
        l3 = nn.AddBias(nn.Linear(l2, self.w3), self.b3)
        return l3

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a node with shape
        (batch_size x 10). Each row is a one-hot vector encoding the correct
        digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss node
        """
        return nn.SoftmaxLoss(self.run(x), y)

    def train(self, dataset):
        """
        Trains the model.

        Runs full passes of mini-batch gradient descent, printing the
        validation accuracy after each pass, until that accuracy exceeds
        `self.target_accuracy`. The previous hard-coded 0.95 stop let
        training end before the 97% the autograder needs for full credit.
        """
        parameters = [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3]
        while True:
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                gradients = nn.gradients(loss, parameters)
                for parameter, gradient in zip(parameters, gradients):
                    parameter.update(gradient, -self.lr)
            accuracy = dataset.get_validation_accuracy()
            print(accuracy)
            if accuracy > self.target_accuracy:
                break

 class LanguageIDModel(object):
    """
    A model for language identification at a single-word granularity.

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)

    NOTE: this class is still an unimplemented template. No parameters are
    created, and run(), get_loss() and train() have no body beyond their
    placeholder strings, so each returns None.
    """
    def __init__(self):
        # Our dataset contains words from five different languages, and the
        # combined alphabets of the five languages contain a total of 47 unique
        # characters.
        # You can refer to self.num_chars or len(self.languages) in your code
        self.num_chars = 47
        self.languages = ["English", "Spanish", "Finnish", "Dutch", "Polish"]

        # Initialize your model parameters here
        "*** YOUR CODE HERE ***"

    def run(self, xs):
        """
        Runs the model for a batch of examples.

        Although words have different lengths, our data processing guarantees
        that within a single batch, all words will be of the same length (L).

        Here `xs` will be a list of length L. Each element of `xs` will be a
        node with shape (batch_size x self.num_chars), where every row in the
        array is a one-hot vector encoding of a character. For example, if we
        have a batch of 8 three-letter words where the last word is "cat", then
        xs[1] will be a node that contains a 1 at position (7, 0). Here the
        index 7 reflects the fact that "cat" is the last word in the batch, and
        the index 0 reflects the fact that the letter "a" is the initial (0th)
        letter of our combined alphabet for this task.

        Your model should use a Recurrent Neural Network to summarize the list
        `xs` into a single node of shape (batch_size x hidden_size), for your
        choice of hidden_size. It should then calculate a node of shape
        (batch_size x 5) containing scores, where higher scores correspond to
        greater probability of the word originating from a particular language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
        Returns:
            A node with shape (batch_size x 5) containing predicted scores
                (also called logits)
        """
        # Not implemented yet: falls through and returns None.
        "*** YOUR CODE HERE ***"

    def get_loss(self, xs, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a node with shape
        (batch_size x 5). Each row is a one-hot vector encoding the correct
        language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
            y: a node with shape (batch_size x 5)
        Returns: a loss node
        """
        # Not implemented yet: falls through and returns None.
        "*** YOUR CODE HERE ***"

    def train(self, dataset):
        """
        Trains the model.
        """
        # Not implemented yet: no training is performed.
        "*** YOUR CODE HERE ***"
--- a/frontend/mnist/nn.py
+++ b/frontend/mnist/nn.py
@@ -0,0 +1,393 @@
 import numpy as np
 # Fix NumPy's global random seed so runs are repeatable (for example the
 # random initial values drawn in Parameter.__init__).
 np.random.seed(42)
 def format_shape(shape):
    """Render a shape tuple as "AxBx..." text, or "()" when it is empty."""
    if not shape:
        return "()"
    return "x".join(str(dim) for dim in shape)

 class Node(object):
    """Base class for graph nodes; `__repr__` expects `self.data` to be set."""
    def __repr__(self):
        # Show the concrete class, the shape of the stored array, and the
        # object's identity so distinct nodes can be told apart.
        kind = type(self).__name__
        shape_text = format_shape(self.data.shape)
        return "<{} shape={} at {}>".format(kind, shape_text, hex(id(self)))

 class DataNode(Node):
    """
    Common base for the Parameter and Constant nodes: a node that simply holds
    an array rather than computing one from other nodes.

    This class is not normally used directly.
    """
    def __init__(self, data):
        # A data node has no inputs, hence the empty parent list.
        self.data = data
        self.parents = []

    def _forward(self, *inputs):
        # The stored array is the node's value; any inputs are ignored.
        return self.data

    @staticmethod
    def _backward(gradient, *inputs):
        # No parents, so there are no gradients to hand back.
        return list()

 class Parameter(DataNode):
    """
    A Parameter node holds the trainable values of a neural network (or
    perceptron).

    Use the `update` method to change the parameter while training.
    """
    def __init__(self, *shape):
        rank = len(shape)
        assert rank == 2, (
            "Shape must have 2 dimensions, instead has {}".format(rank))
        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            "Shape must consist of positive integers, got {!r}".format(shape))
        # Uniform initial values in [-bound, bound]; the bound shrinks as the
        # mean of the two dimensions grows.
        bound = np.sqrt(3.0 / np.mean(shape))
        super().__init__(np.random.uniform(low=-bound, high=bound, size=shape))

    def update(self, direction, multiplier):
        """
        Adds `multiplier * direction.data` to this parameter in place.

        `direction` must be a Constant with the same shape as the parameter and
        `multiplier` a Python int or float. The result must stay finite.
        """
        assert isinstance(direction, Constant), (
            "Update direction must be a {} node, instead has type {!r}".format(
                Constant.__name__, type(direction).__name__))
        step_shape = direction.data.shape
        own_shape = self.data.shape
        assert step_shape == own_shape, (
            "Update direction shape {} does not match parameter shape "
            "{}".format(format_shape(step_shape), format_shape(own_shape)))
        assert isinstance(multiplier, (int, float)), (
            "Multiplier must be a Python scalar, instead has type {!r}".format(
                type(multiplier).__name__))
        self.data += multiplier * direction.data
        everything_finite = np.all(np.isfinite(self.data))
        assert everything_finite, (
            "Parameter contains NaN or infinity after update, cannot continue")

 class Constant(DataNode):
    """
    A Constant node wraps a fixed floating-point array. It stands for:
    * Input features
    * Output labels
    * Gradients computed by back-propagation

    You normally do not build Constant nodes yourself; they are provided by the
    dataset or returned from `nn.gradients`.
    """
    def __init__(self, data):
        is_array = isinstance(data, np.ndarray)
        assert is_array, (
            "Data should be a numpy array, instead has type {!r}".format(
                type(data).__name__))
        is_float = np.issubdtype(data.dtype, np.floating)
        assert is_float, (
            "Data should be a float array, instead has data type {!r}".format(
                data.dtype))
        DataNode.__init__(self, data)

 class FunctionNode(Node):
    """
    A node whose value is computed from other nodes (its parents).
    It records the parents so that gradients can later be propagated to them.
    """
    def __init__(self, *parents):
        assert all(isinstance(parent, Node) for parent in parents), (
            "Inputs must be node objects, instead got types {!r}".format(
                tuple(type(parent).__name__ for parent in parents)))
        self.parents = parents
        # Evaluate eagerly: the subclass's _forward receives the parents' arrays.
        parent_values = [parent.data for parent in parents]
        self.data = self._forward(*parent_values)

 class Add(FunctionNode):
    """
    Element-wise sum of two matrices of identical shape.

    Usage: nn.Add(x, y)
    Inputs:
        x: a Node with shape (batch_size x num_features)
        y: a Node with the same shape as x
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        lhs, rhs = inputs[0], inputs[1]
        assert lhs.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                lhs.ndim))
        assert rhs.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                rhs.ndim))
        assert lhs.shape == rhs.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(lhs.shape), format_shape(rhs.shape)))
        return lhs + rhs

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # Each operand receives the incoming gradient unchanged.
        return [gradient, gradient]

 class AddBias(FunctionNode):
    """
    Adds one bias row to every row of a feature matrix.

    Usage: nn.AddBias(features, bias)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        bias: a Node with shape (1 x num_features)
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        features, bias = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert bias.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                bias.ndim))
        assert bias.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(bias.shape)))
        assert features.shape[1] == bias.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(features.shape), format_shape(bias.shape)))
        # Broadcasting repeats the single bias row across the batch.
        return features + bias

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # The bias is shared by all rows, so its gradient is the column-wise
        # sum, kept as a (1 x num_features) row.
        bias_gradient = np.sum(gradient, axis=0, keepdims=True)
        return [gradient, bias_gradient]

 class DotProduct(FunctionNode):
    """
    Dot product of every feature row with a single weight row.

    Usage: nn.DotProduct(features, weights)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        weights: a Node with shape (1 x num_features)
    Output: a Node with shape (batch_size x 1)
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert weights.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(weights.shape)))
        assert features.shape[1] == weights.shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(features.shape), format_shape(weights.shape)))
        return np.dot(features, weights.T)

    @staticmethod
    def _backward(gradient, *inputs):
        # Deliberately unsupported. A disabled implementation was left here:
        #   assert gradient.shape[0] == inputs[0].shape[0]
        #   assert gradient.shape[1] == 1
        #   return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
        raise NotImplementedError(
            "Backpropagation through DotProduct nodes is not needed in this "
            "assignment")

 class Linear(FunctionNode):
    """
    Multiplies the input by a weight matrix (a linear transformation).

    Usage: nn.Linear(features, weights)
    Inputs:
        features: a Node with shape (batch_size x input_features)
        weights: a Node with shape (input_features x output_features)
    Output: a node with shape (batch_size x output_features)
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                features.ndim))
        assert weights.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                weights.ndim))
        assert features.shape[1] == weights.shape[0], (
            "Second dimension of first input should match first dimension of "
            "second input, instead got shapes {} and {}".format(
                format_shape(features.shape), format_shape(weights.shape)))
        return np.dot(features, weights)

    @staticmethod
    def _backward(gradient, *inputs):
        features, weights = inputs[0], inputs[1]
        assert gradient.shape[0] == features.shape[0]
        assert gradient.shape[1] == weights.shape[1]
        # For out = features @ weights:
        #   d(features) = gradient @ weights^T, d(weights) = features^T @ gradient
        features_gradient = np.dot(gradient, weights.T)
        weights_gradient = np.dot(features.T, gradient)
        return [features_gradient, weights_gradient]

 class ReLU(FunctionNode):
    """
    Element-wise Rectified Linear Unit: max(x, 0).
    Every negative entry of the input becomes zero; other entries pass through.

    Usage: nn.ReLU(x)
    Input:
        x: a Node with shape (batch_size x num_features)
    Output: a Node with the same shape as x, but no negative entries
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 1, "Expected 1 input, got {}".format(count)
        values = inputs[0]
        assert values.ndim == 2, (
            "Input should have 2 dimensions, instead has {}".format(
                values.ndim))
        return np.maximum(values, 0)

    @staticmethod
    def _backward(gradient, *inputs):
        values = inputs[0]
        assert gradient.shape == values.shape
        # The gradient flows only where the input was strictly positive.
        pass_through = np.where(values > 0, 1.0, 0.0)
        return [gradient * pass_through]

 class SquareLoss(FunctionNode):
    """
    Mean of 0.5 * (a[i,j] - b[i,j])**2 taken over every position (i,j) of the
    two (batch_size x dim) inputs.

    Usage: nn.SquareLoss(a, b)
    Inputs:
        a: a Node with shape (batch_size x dim)
        b: a Node with shape (batch_size x dim)
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        a, b = inputs[0], inputs[1]
        assert a.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                a.ndim))
        assert b.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                b.ndim))
        assert a.shape == b.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(a.shape), format_shape(b.shape)))
        return np.mean(np.square(a - b) / 2)

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        a, b = inputs[0], inputs[1]
        # The mean divides by the element count; the two operands get
        # gradients of opposite sign.
        element_count = a.size
        grad_a = gradient * (a - b) / element_count
        grad_b = gradient * (b - a) / element_count
        return [grad_a, grad_b]

 class SoftmaxLoss(FunctionNode):
    """
    Batched softmax (cross-entropy) loss for classification.

    IMPORTANT: the inputs are not interchangeable - logits first, labels second.

    Usage: nn.SoftmaxLoss(logits, labels)
    Inputs:
        logits: a Node with shape (batch_size x num_classes). Each row holds
            the scores for one example belonging to each class. A score can be
            an arbitrary real number.
        labels: a Node with shape (batch_size x num_classes) that encodes the
            correct labels for the examples. All entries must be non-negative
            and the sum of values along each row should be 1.
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def log_softmax(logits):
        # Subtract the row maximum before exponentiating so exp() cannot
        # overflow, then subtract the log of the row's normaliser.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        shifted -= np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        return shifted

    @staticmethod
    def _forward(*inputs):
        count = len(inputs)
        assert count == 2, "Expected 2 inputs, got {}".format(count)
        logits, labels = inputs[0], inputs[1]
        assert logits.ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                logits.ndim))
        assert labels.ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                labels.ndim))
        assert logits.shape == labels.shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(logits.shape), format_shape(labels.shape)))
        assert np.all(labels >= 0), (
            "All entries in the labels input must be non-negative")
        assert np.allclose(np.sum(labels, axis=1), 1), (
            "Labels input must sum to 1 along each row")
        log_probs = SoftmaxLoss.log_softmax(logits)
        # Per-example cross-entropy, averaged over the batch.
        return np.mean(-np.sum(labels * log_probs, axis=1))

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        logits, labels = inputs[0], inputs[1]
        log_probs = SoftmaxLoss.log_softmax(logits)
        batch_size = logits.shape[0]
        logits_gradient = gradient * (np.exp(log_probs) - labels) / batch_size
        labels_gradient = gradient * -log_probs / batch_size
        return [logits_gradient, labels_gradient]

 def gradients(loss, parameters):
    """
    Back-propagates from `loss` and returns its gradient with respect to each
    of the given parameters.

    Usage: nn.gradients(loss, parameters)
    Inputs:
        loss: a SquareLoss or SoftmaxLoss node
        parameters: a list (or iterable) containing Parameter nodes
    Output: a list of Constant objects, representing the gradient of the loss
        with respect to each provided parameter.

    A loss node can be used for back-propagation only once; it is marked with
    a `used` attribute and a second call fails an assertion.
    """

    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
        "Loss must be a loss node, instead has type {!r}".format(
            type(loss).__name__))
    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
        "Parameters must all have type {}, instead got types {!r}".format(
            Parameter.__name__,
            tuple(type(parameter).__name__ for parameter in parameters)))
    assert not hasattr(loss, "used"), (
        "Loss node has already been used for backpropagation, cannot reuse")

    loss.used = True

    seen = set()
    order = []

    def visit(node):
        # Depth-first walk that records each node after all of its parents,
        # so `order` lists inputs before the nodes that consume them.
        if node in seen:
            return
        for parent in node.parents:
            visit(parent)
        seen.add(node)
        order.append(node)

    visit(loss)
    # Parameters not reachable from the loss still need a (zero) gradient slot.
    seen.update(parameters)

    grads = {}
    for node in seen:
        grads[node] = np.zeros_like(node.data)
    grads[loss] = 1.0

    # Walk from the loss back towards the inputs, accumulating into parents.
    for node in order[::-1]:
        parent_values = [parent.data for parent in node.parents]
        parent_grads = node._backward(grads[node], *parent_values)
        for parent, parent_grad in zip(node.parents, parent_grads):
            grads[parent] += parent_grad

    return [Constant(grads[parameter]) for parameter in parameters]

 def as_scalar(node):
    """
    Returns the value of a Node as a standard Python number. This only works
    for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
    DotProduct with a batch size of 1 element).

    The node is only read: `node.data` keeps its original shape.
    """

    assert isinstance(node, Node), (
        "Input must be a node object, instead has type {!r}".format(
            type(node).__name__))
    assert node.data.size == 1, (
        "Node has shape {}, cannot convert to a scalar".format(
            format_shape(node.data.shape)))
    # ndarray.item() extracts the single element as a Python scalar without
    # replacing node.data by a flattened copy, which the previous code did as
    # a side effect of this query.
    return node.data.item()
--- a/frontend/uct/data/mnist.npz
+++ b/frontend/uct/data/mnist.npz
--- a/frontend/uct/dataset.py
+++ b/frontend/uct/dataset.py
@@ -0,0 +1,36 @@
 import collections
 import os
 import time

 import matplotlib.pyplot as plt
 import numpy as np

 import uctc.nn as nn

 # Module-level switch read by maybe_sleep_and_close: when it is False that
 # helper does nothing.
 use_graphics = True

 def maybe_sleep_and_close(seconds):
    """
    If graphics are enabled and at least one matplotlib figure is open, waits
    `seconds` and then closes every open figure. Otherwise does nothing.

    Inputs:
        seconds: how long to sleep before closing the figures
    """
    if use_graphics and plt.get_fignums():
        time.sleep(seconds)
        for fignum in plt.get_fignums():
            fig = plt.figure(fignum)
            plt.close(fig)
            try:
                # This raises a TclError on some Windows machines
                fig.canvas.start_event_loop(1e-3)
            except Exception:
                # Best-effort only: a failure here is ignored. Unlike a bare
                # `except:`, this still lets KeyboardInterrupt and SystemExit
                # propagate so the program can be interrupted.
                pass

 def get_data_path(filename):
    """
    Locates a data file relative to this module and returns its path.

    Three places are tried in order: `../data/`, `data/`, and the module's own
    directory. Raises Exception if the file is in none of them.
    """
    candidates = (
        os.path.join(os.path.dirname(__file__), os.pardir, "data", filename),
        os.path.join(os.path.dirname(__file__), "data", filename),
        os.path.join(os.path.dirname(__file__), filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Could not find data file: {}".format(filename))

--- a/frontend/uct/mnist.py
+++ b/frontend/uct/mnist.py
@@ -0,0 +1,232 @@
 import numpy as np
 import time
 import os
 import collections

 import matplotlib.pyplot as plt
 import uctc.nn as nn 
 from utils import parameter_data, Dataset

 # When True, DigitClassificationDataset builds a matplotlib figure and
 # refreshes it during training; when False only text progress is printed.
 use_graphics = False

 class DigitClassificationModel(object):
    """
    A model for handwritten digit classification using the MNIST dataset.

    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
    into a 784-dimensional vector for the purposes of this model. Each entry in
    the vector is a floating point number between 0 and 1.

    The goal is to sort each digit into one of 10 classes (number 0 through 9).

    The network built here has two hidden ReLU layers (200 and 100 units)
    followed by a linear output layer with 10 scores.

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)
    """
    def __init__(self):
        # Initialize your model parameters here
        "*** YOUR CODE HERE ***"
        # Layer widths: input -> h1 -> h2 -> output.
        self.input_features = 784
        self.h1 = 200
        self.h2 = 100
        self.output_features = 10
        # Training hyper-parameters.
        self.lr = 0.01
        self.batch_size = 100
        # One weight matrix and one (1 x width) bias row per layer.
        self.w1 = nn.Parameter(parameter_data(self.input_features, self.h1))
        self.b1 = nn.Parameter(parameter_data(1, self.h1))
        self.w2 = nn.Parameter(parameter_data(self.h1, self.h2))
        self.b2 = nn.Parameter(parameter_data(1, self.h2))
        self.w3 = nn.Parameter(parameter_data(self.h2, self.output_features))
        self.b3 = nn.Parameter(parameter_data(1, self.output_features))


    def run(self, x):
        """
        Runs the model for a batch of examples.

        Your model should predict a node with shape (batch_size x 10),
        containing scores. Higher scores correspond to greater probability of
        the image belonging to a particular class.

        Inputs:
            x: a node with shape (batch_size x 784)
        Output:
            A node with shape (batch_size x 10) containing predicted scores
                (also called logits)
        """
        "*** YOUR CODE HERE ***"
        # First hidden layer.
        pre1 = nn.AddBias(nn.Linear(x, self.w1), self.b1)
        hidden1 = nn.ReLU(pre1)
        # Second hidden layer.
        pre2 = nn.AddBias(nn.Linear(hidden1, self.w2), self.b2)
        hidden2 = nn.ReLU(pre2)
        # Output layer: raw scores, no activation.
        logits = nn.AddBias(nn.Linear(hidden2, self.w3), self.b3)
        return logits

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a node with shape
        (batch_size x 10). Each row is a one-hot vector encoding the correct
        digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss node
        """
        "*** YOUR CODE HERE ***"
        logits = self.run(x)
        return nn.SoftmaxLoss(logits, y)

    def train(self, dataset):
        """
        Trains the model.

        Repeats full passes over the dataset, updating all six parameters after
        every batch, until the validation accuracy exceeds 0.95. The accuracy
        is printed after each pass.
        """
        "*** YOUR CODE HERE ***"
        params = [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3]
        while True:
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                grads = nn.gradients(loss, params)
                for param, grad in zip(params, grads):
                    param.update(grad, self.lr)
            accuracy = dataset.get_validation_accuracy()
            print(accuracy)
            if accuracy > 0.95:
                break

 def get_data_path(filename):
    """
    Locates a data file relative to this module and returns its path.

    Three places are tried in order: `../data/`, `data/`, and the module's own
    directory. Raises Exception if the file is in none of them.
    """
    candidates = (
        os.path.join(os.path.dirname(__file__), os.pardir, "data", filename),
        os.path.join(os.path.dirname(__file__), "data", filename),
        os.path.join(os.path.dirname(__file__), filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Could not find data file: {}".format(filename))

 class DigitClassificationDataset(Dataset):
    """
    MNIST training data plus a held-out development split, with periodic
    progress reporting (and optional live plotting) while the model trains.
    """
    def __init__(self, model: DigitClassificationModel):
        """
        Loads `mnist.npz`, splits the test set into dev/test halves, one-hot
        encodes the training labels and, if `use_graphics` is set, builds the
        matplotlib figure that `iterate_once` refreshes.

        Inputs:
            model: the model whose `run` method is used to score the dev set
        """
        mnist_path = get_data_path("mnist.npz")

        with np.load(mnist_path) as data:
            train_images = data["train_images"]
            train_labels = data["train_labels"]
            test_images = data["test_images"]
            test_labels = data["test_labels"]
            assert len(train_images) == len(train_labels) == 60000
            assert len(test_images) == len(test_labels) == 10000
            # Even-indexed test examples become the dev set, odd-indexed ones
            # the test set; copies are taken inside the `with` block.
            self.dev_images = np.array(test_images[0::2], copy=True)
            self.dev_labels = np.array(test_labels[0::2], copy=True)
            self.test_images = np.array(test_images[1::2], copy=True)
            self.test_labels = np.array(test_labels[1::2], copy=True)

        train_labels_one_hot = np.zeros((len(train_images), 10))
        train_labels_one_hot[range(len(train_images)), train_labels] = 1

        super().__init__(train_images, train_labels_one_hot)

        self.model = model
        self.epoch = 0

        if use_graphics:
            width = 20  # Width of each row expressed as a multiple of image width
            samples = 100  # Number of images to display per label
            fig = plt.figure()
            ax = {}
            images = collections.defaultdict(list)
            texts = collections.defaultdict(list)
            # Built from 9 down to 0 so that row 9 exists first and the other
            # rows can share its x axis.
            for i in reversed(range(10)):
                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
                                         sharex=ax.get(9))
                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
                ax[i].set_yticks([])
                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
                           va="center")
                ax[i].set_xlim(0, 28 * width)
                ax[i].set_ylim(0, 28)
                for j in range(samples):
                    images[i].append(ax[i].imshow(
                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
                        alpha=0.3))
                    texts[i].append(ax[i].text(
                        0, 0, "", ha="center", va="top", fontsize="smaller"))
            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
            ax[9].set_xticklabels(
                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
            ax[9].tick_params(axis="x", pad=16)
            ax[9].set_xlabel("Probability of Correct Label")
            status = ax[0].text(
                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
                va="bottom")
            plt.show(block=False)

            self.width = width
            self.samples = samples
            self.fig = fig
            self.images = images
            self.texts = texts
            self.status = status
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yields the (x, y) batches of one pass over the training data.

        After a batch has been consumed, and if more than one second has
        passed since the last report, the dev set is scored, a progress line
        is printed and, with graphics enabled, the figure is refreshed.
        """
        self.epoch += 1

        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            if time.time() - self.last_update > 1:
                dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
                dev_predicted = np.array(nn.argmax(dev_logits, axis=1).data())
                dev_accuracy = np.mean(dev_predicted == self.dev_labels)
                progress = (
                    "epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
                    "{:.2%}".format(
                        self.epoch, i, len(self.x) // batch_size, dev_accuracy))
                print(progress)
                if use_graphics:
                    self.status.set_text(progress)
                    # Class probabilities are only needed for the plot, so
                    # they are computed only when graphics are on.
                    dev_probs = np.array(
                        nn.exp(nn.log_softmax(dev_logits)).data()).reshape(
                            len(self.dev_images), 10)
                    # `digit` is used here so the batch index `i` of the
                    # enclosing loop is not overwritten.
                    for digit in range(10):
                        is_digit = self.dev_labels == digit
                        predicted = dev_predicted[is_digit]
                        probs = dev_probs[is_digit][:, digit]
                        # Pick `samples` examples evenly spread over the
                        # examples sorted by probability of the true label.
                        linspace = np.linspace(
                            0, len(probs) - 1, self.samples).astype(int)
                        indices = probs.argsort()[linspace]
                        for j, (prob, image) in enumerate(zip(
                                probs[indices],
                                self.dev_images[is_digit][indices])):
                            self.images[digit][j].set_data(image.reshape((28, 28)))
                            left = prob * (self.width - 1) * 28
                            if predicted[indices[j]] == digit:
                                self.images[digit][j].set_cmap("Greens")
                                self.texts[digit][j].set_text("")
                            else:
                                # Misclassified: draw in red and label the
                                # image with the predicted digit.
                                self.images[digit][j].set_cmap("Reds")
                                self.texts[digit][j].set_text(predicted[indices[j]])
                                self.texts[digit][j].set_x(left + 14)
                            self.images[digit][j].set_extent([left, left + 28, 0, 28])
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

    def get_validation_accuracy(self):
        """
        Scores the dev set with the model and returns the fraction of dev
        examples whose arg-max prediction equals the stored label.
        """
        dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
        dev_predicted = np.array(nn.argmax(dev_logits, axis=1).data())
        dev_accuracy = np.mean(dev_predicted == self.dev_labels)
        return dev_accuracy

 # Script entry point: build the model and its dataset, then train. There is
 # no `if __name__ == "__main__"` guard, so this also runs on import.
 model = DigitClassificationModel()
 dataset = DigitClassificationDataset(model)
 model.train(dataset)
--- a/frontend/uct/perception.py
+++ b/frontend/uct/perception.py
@@ -0,0 +1,129 @@
 import numpy as np
 import time
 import os

 import matplotlib.pyplot as plt
 import uctc.nn as nn 
 from utils import parameter_data, Dataset

 # When True, PerceptronDataset creates a matplotlib scatter plot and updates
 # it during training; when False only text progress is printed.
 use_graphics = False
 class PerceptronModel(object):
    def __init__(self, dimensions):
        """
        Initialize a new Perceptron instance.

        A perceptron classifies data points as either belonging to a particular
        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
        For example, dimensions=2 would mean that the perceptron must classify
        2D points.
        """
        initial_weights = parameter_data(dimensions, 1)
        self.w = nn.Parameter(initial_weights)

    def get_weights(self):
        """
        Return the current weights of the perceptron, as given by the weight
        parameter's `data()` method.
        """
        weights = self.w.data()
        return weights

    def run(self, x):
        """
        Calculates the score assigned by the perceptron to a data point x.

        Inputs:
            x: a node with shape (1 x dimensions)
        Returns: a node containing a single number (the score)
        """
        "*** YOUR CODE HERE ***"
        return nn.Linear(x, self.w)

    def get_prediction(self, x):
        """
        Calculates the predicted class for a single data point `x`.

        Returns: 1 or -1
        """
        "*** YOUR CODE HERE ***"
        score = self.run(x).data()[0]
        # A score of exactly zero counts as the positive class.
        return 1 if score >= 0 else -1


    def train(self, dataset):
        """
        Train the perceptron until convergence, i.e. until a full pass over
        the dataset produces no misclassified point.
        """
        "*** YOUR CODE HERE ***"
        batch_size = 1

        while True:
            mistakes = 0
            for x, y in dataset.iterate_once(batch_size):
                prediction = self.get_prediction(x)
                features = np.array(x.data(), dtype=np.float32)
                label = int(y.data()[0])
                if prediction != label:
                    mistakes += 1
                    # NOTE(review): the multiplier is -label, which presumably
                    # matches the sign convention of uctc's Parameter.update -
                    # confirm against the backend.
                    self.w.update(nn.pyarray_to_tensor(features), -label)
            if mistakes == 0:
                break

 class PerceptronDataset(Dataset):
    """
    500 random 2-D points (plus a constant 1 feature) labelled +1/-1 by a fixed
    line, with progress reporting and optional live plotting during training.
    """
    def __init__(self, model: PerceptronModel):
        """
        Generates the points and labels and, if `use_graphics` is set, creates
        the scatter plot that `iterate_once` keeps up to date.

        Inputs:
            model: the perceptron whose weights are reported and drawn
        """
        points = 500
        # Two random coordinates followed by a constant-1 column.
        x = np.hstack([np.random.randn(points, 2), np.ones((points, 1))])
        # +1 where x0 + 2*x1 - 1 >= 0, otherwise -1.
        y = np.where(x[:, 0] + 2 * x[:, 1] - 1 >= 0, 1.0, -1.0)
        super().__init__(x, np.expand_dims(y, axis=1))

        self.model = model
        self.epoch = 0
        limits = np.array([-3.0, 3.0])
        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(limits)
            ax.set_ylim(limits)
            positive = ax.scatter(*x[y == 1, :-1].T, color="red", marker="+")
            negative = ax.scatter(*x[y == -1, :-1].T, color="blue", marker="_")
            line, = ax.plot([], [], color="black")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([positive, negative], [1, -1])
            plt.show(block=False)

            self.fig = fig
            self.line = line
            self.text = text
        self.limits = limits
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yields the (x, y) batches of one pass over the data.

        After a batch has been consumed, and if more than 0.001 s has passed
        since the last report, the current weights are printed and, with
        graphics enabled, the decision line and status text are redrawn.
        """
        self.epoch += 1

        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
            yield x, y

            if time.time() - self.last_update > 0.001:
                w = self.model.get_weights()
                limits = self.limits
                status = f"epoch: {self.epoch}\npoint: {i * batch_size + 1}/{len(self.x)}\nweights: {w}"
                print(status)
                if use_graphics:
                    # Draw the line w0*x + w1*y + w2 = 0 over the plot limits.
                    if w[1] != 0:
                        self.line.set_data(limits, (-w[0] * limits - w[2]) / w[1])
                    elif w[0] != 0:
                        # Vertical line: y cannot be solved for.
                        self.line.set_data(np.full(2, -w[2] / w[0]), limits)
                    else:
                        self.line.set_data([], [])
                    # The text update and redraw must run for every branch
                    # above. They used to sit inside the `else`, so the figure
                    # stopped refreshing once a weight became non-zero.
                    self.text.set_text(status)
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

 # Script entry point: a 3-weight perceptron (two coordinates plus the constant
 # feature added by PerceptronDataset) is built and trained. There is no
 # `if __name__ == "__main__"` guard, so this also runs on import.
 model = PerceptronModel(3)
 dataset = PerceptronDataset(model)
 model.train(dataset)
--- a/frontend/uct/regression.py
+++ b/frontend/uct/regression.py
@@ -0,0 +1,141 @@
 import numpy as np
 np.random.seed(42)
 import time
 import os

 import matplotlib.pyplot as plt
 import uctc.nn as nn 
 from utils import parameter_data, Dataset

 use_graphics = False

 class RegressionModel(object):
    """
    A neural network model for approximating a function that maps from real
    numbers to real numbers. The network should be sufficiently large to be able
    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.

    The network has one hidden layer:
    Linear -> AddBias -> ReLU -> Linear -> AddBias.
    """
    def __init__(self):
        # Hyperparameters.
        self.batch_size = 10
        self.input_features = 1
        self.output_features = 1
        self.hidden_f1 = 50
        self.lr = 0.01
        # Trainable parameters, initialised from parameter_data in a fixed order.
        self.w1 = nn.Parameter(parameter_data(self.input_features, self.hidden_f1))
        self.b1 = nn.Parameter(parameter_data(1, self.hidden_f1))
        self.w2 = nn.Parameter(parameter_data(self.hidden_f1, self.output_features))
        self.b2 = nn.Parameter(parameter_data(1, self.output_features))

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
        Returns:
            A node with shape (batch_size x 1) containing predicted y-values
        """
        # The NumPy re-computation of the same forward pass that used to live
        # here was never read, so it has been dropped.
        linear1 = nn.Linear(x, self.w1)
        bias1 = nn.AddBias(linear1, self.b1)
        act1 = nn.ReLU(bias1)
        linear2 = nn.Linear(act1, self.w2)
        bias2 = nn.AddBias(linear2, self.b2)
        return bias2

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
            y: a node with shape (batch_size x 1), containing the true y-values
                to be used for training
        Returns: a loss node
        """
        predict_y = self.run(x)
        return nn.SquareLoss(predict_y, y)

    def train(self, dataset):
        """
        Trains the model.

        Repeats full passes over the dataset, applying one update per batch to
        every parameter. Training stops after the first pass whose final batch
        has a loss below 0.01.
        """
        parameters = [self.w1, self.b1, self.w2, self.b2]
        while True:
            for x, y in dataset.iterate_once(self.batch_size):
                loss = self.get_loss(x, y)
                gradients = nn.gradients(loss, parameters)
                for parameter, gradient in zip(parameters, gradients):
                    parameter.update(gradient, self.lr)
            # Only the loss of the last batch of the pass is inspected.
            if loss.data()[0] < 0.01:
                break
            
    
 class RegressionDataset(Dataset):
    """
    Dataset of 200 points of y = sin(x), with x evenly spaced on [-2pi, 2pi]
    and shuffled with a fixed seed.

    While iterating, it periodically prints the number of processed samples
    and the loss of the current batch, and redraws the learned curve when
    use_graphics is enabled.
    """
    def __init__(self, model: RegressionModel):
        x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        np.random.RandomState(0).shuffle(x)
        # Indices that put the shuffled x back in ascending order, used for plotting.
        self.argsort_x = np.argsort(x.flatten())
        y = np.sin(x)
        super().__init__(x, y)

        self.model = model
        self.processed = 0

        if use_graphics:
            fig, ax = plt.subplots(1, 1)
            ax.set_xlim(-2 * np.pi, 2 * np.pi)
            ax.set_ylim(-1.4, 1.4)
            real, = ax.plot(x[self.argsort_x], y[self.argsort_x], color="blue")
            learned, = ax.plot([], [], color="red")
            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
            ax.legend([real, learned], ["real", "learned"])
            plt.show(block=False)

            self.fig = fig
            self.learned = learned
            self.text = text
        self.last_update = time.time()

    def iterate_once(self, batch_size):
        """
        Yield (x, y) batches from the parent class, reporting progress whenever
        more than 0.01 s has elapsed since the previous report.
        """
        for x, y in super().iterate_once(batch_size):
            yield x, y
            self.processed += batch_size

            if time.time() - self.last_update > 0.01:
                loss = self.model.get_loss(x, y).data()[0]
                print(f"processed: {self.processed}\nloss: {loss: .6f}")
                if use_graphics:
                    # The full-dataset prediction is only needed for the plot,
                    # so it is computed only when graphics are enabled.
                    predicted = np.array(self.model.run(nn.Constant(self.x)).data())
                    self.learned.set_data(self.x[self.argsort_x], predicted[self.argsort_x])
                    self.text.set_text(f"processed: {self.processed}\nloss: {loss: .6f}")
                    self.fig.canvas.draw_idle()
                    self.fig.canvas.start_event_loop(1e-3)
                self.last_update = time.time()

 # Script entry point: build the regression model and its dataset, then train.
 # These statements run at import time, so importing this module starts training.
 model = RegressionModel()
 dataset = RegressionDataset(model)
 model.train(dataset)
--- a/frontend/uct/test/01_addbias_test.py
+++ b/frontend/uct/test/01_addbias_test.py
@@ -0,0 +1,72 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np

 class LinearTestModel:
    """uctc model under test: a single AddBias layer scored with SquareLoss."""

    def __init__(self, output_features):
        bias_shape = [1, output_features]
        self.b1 = nn.Parameter(bias_shape)

    def forward(self, x):
        """Return the node produced by adding the bias to x."""
        return nn.AddBias(x, self.b1)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the data of the loss gradient with respect to b1."""
        grads = nn.gradients(self.get_loss(x, y), [self.b1])
        return grads[0].data()

 class StdLinerTestModel:
    """Reference (std_model) counterpart of LinearTestModel, using the same bias values."""

    def __init__(self, output_features, tmodel: LinearTestModel):
        bias = stdnn.Parameter(1, output_features)
        # Replace the random initial values with those of the model under test.
        bias.data = np.array(tmodel.b1.data()).reshape(1, output_features)
        self.b1 = bias

    def forward(self, x):
        """Return the node produced by adding the bias to x."""
        return stdnn.AddBias(x, self.b1)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradient with respect to b1 as a flat Python list."""
        grads = stdnn.gradients(self.get_loss(x, y), [self.b1])
        return grads[0].data.flatten().tolist()

 # Test configuration and random inputs shared by both implementations.
 output_features = 32
 batch_size = 4
 x = np.random.randn(batch_size, output_features).astype(np.float32)
 y = np.random.randn(batch_size, output_features).astype(np.float32)

 # Model under test: forward output, loss and bias gradient.
 model = LinearTestModel(output_features)
 test_x = nn.Constant(x)
 predict_y = model.forward(test_x).data()
 test_y = nn.Constant(y)
 loss = model.get_loss(test_x, test_y).data()
 g_b1 = model.backward(test_x, test_y)

 # Reference model, initialised from the model under test.
 stdmodel = StdLinerTestModel(output_features, model)
 std_test_x = stdnn.Constant(x)
 std_predict_y = stdmodel.forward(std_test_x)
 std_test_y = stdnn.Constant(y)
 std_loss = stdmodel.get_loss(std_test_x, std_test_y)
 std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

 # check forward
 # Flatten the whole reference output so every row of the batch is compared;
 # taking only row 0 would make zip() stop after the first sample.
 for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
    if abs(got - want) > 1e-4:
        assert 0, "Forward data mismatch!"

 # check loss
 if abs(loss[0] - std_loss.data) > 1e-4:
    assert 0, "Loss mismatch!"

 # check backward
 # Loop variables are named got/want so the input arrays x and y are not overwritten.
 for i, (got, want) in enumerate(zip(g_b1, std_g_b1)):
    if abs(got - want) > 1e-4:
        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {got} while std g_b1 is {want}"


 print("Test passed")
--- a/frontend/uct/test/02_linear_test.py
+++ b/frontend/uct/test/02_linear_test.py
@@ -0,0 +1,81 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np

 class LinearTestModel:
    """uctc model under test: Linear followed by AddBias, scored with SquareLoss."""

    def __init__(self, input_features, output_features):
        weight_shape = [input_features, output_features]
        bias_shape = [1, output_features]
        self.w1 = nn.Parameter(weight_shape)
        self.b1 = nn.Parameter(bias_shape)

    def forward(self, x):
        """Return the output node of Linear + AddBias applied to x."""
        projected = nn.Linear(x, self.w1)
        return nn.AddBias(projected, self.b1)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the data of the loss gradients for (w1, b1)."""
        grad_w, grad_b = nn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w.data(), grad_b.data()

 class StdLinerTestModel:
    """Reference (std_model) counterpart of LinearTestModel, using the same parameter values."""

    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
        # Create both parameters first, in the original order, then overwrite
        # their random values with those of the model under test.
        weight = stdnn.Parameter(input_features, output_features)
        bias = stdnn.Parameter(1, output_features)
        weight.data = np.array(tmodel.w1.data()).reshape(input_features, output_features)
        bias.data = np.array(tmodel.b1.data()).reshape(1, output_features)
        self.w1 = weight
        self.b1 = bias

    def forward(self, x):
        """Return the output node of Linear + AddBias applied to x."""
        projected = stdnn.Linear(x, self.w1)
        return stdnn.AddBias(projected, self.b1)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradients for (w1, b1) as flat Python lists."""
        grad_w, grad_b = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w.data.flatten().tolist(), grad_b.data.flatten().tolist()

 # Test configuration and random inputs shared by both implementations.
 input_features = 16
 output_features = 32
 batch_size = 4
 x = np.random.randn(batch_size, input_features).astype(np.float32)
 y = np.random.randn(batch_size, output_features).astype(np.float32)

 # Model under test: forward output, loss and gradients.
 model = LinearTestModel(input_features, output_features)
 test_x = nn.Constant(x)
 predict_y = model.forward(test_x).data()
 test_y = nn.Constant(y)
 loss = model.get_loss(test_x, test_y).data()
 g_w1, g_b1 = model.backward(test_x, test_y)

 # Reference model, initialised from the model under test.
 stdmodel = StdLinerTestModel(input_features, output_features, model)
 std_test_x = stdnn.Constant(x)
 std_predict_y = stdmodel.forward(std_test_x)
 std_test_y = stdnn.Constant(y)
 std_loss = stdmodel.get_loss(std_test_x, std_test_y)
 std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

 # check forward
 # Flatten the whole reference output so every row of the batch is compared;
 # taking only row 0 would make zip() stop after the first sample.
 for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
    if abs(got - want) > 1e-4:
        assert 0, "Forward data mismatch!"

 # check loss
 if abs(loss[0] - std_loss.data) > 1e-4:
    assert 0, "Loss mismatch!"

 # check backward
 # Loop variables are named got/want so the input arrays x and y are not overwritten.
 for i, (got, want) in enumerate(zip(g_w1, std_g_w1)):
    if abs(got - want) > 1e-4:
        assert 0, f"Gradient w1 mismatch at position {i}, g_w1 is {got} while std g_w1 is {want}"
 for i, (got, want) in enumerate(zip(g_b1, std_g_b1)):
    if abs(got - want) > 1e-4:
        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {got} while std g_b1 is {want}"


 print("Test passed")
--- a/frontend/uct/test/03_relu_test.py
+++ b/frontend/uct/test/03_relu_test.py
@@ -0,0 +1,83 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np

 class LinearTestModel:
    """uctc model under test: Linear -> AddBias -> ReLU, scored with SquareLoss."""

    def __init__(self, input_features, output_features):
        weight_shape = [input_features, output_features]
        bias_shape = [1, output_features]
        self.w1 = nn.Parameter(weight_shape)
        self.b1 = nn.Parameter(bias_shape)

    def forward(self, x):
        """Return the output node of Linear + AddBias + ReLU applied to x."""
        pre_activation = nn.AddBias(nn.Linear(x, self.w1), self.b1)
        return nn.ReLU(pre_activation)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the data of the loss gradients for (w1, b1)."""
        grad_w, grad_b = nn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w.data(), grad_b.data()

 class StdLinerTestModel:
    """Reference (std_model) counterpart of LinearTestModel, using the same parameter values."""

    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
        # Create both parameters first, in the original order, then overwrite
        # their random values with those of the model under test.
        weight = stdnn.Parameter(input_features, output_features)
        bias = stdnn.Parameter(1, output_features)
        weight.data = np.array(tmodel.w1.data()).reshape(input_features, output_features)
        bias.data = np.array(tmodel.b1.data()).reshape(1, output_features)
        self.w1 = weight
        self.b1 = bias

    def forward(self, x):
        """Return the output node of Linear + AddBias + ReLU applied to x."""
        pre_activation = stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1)
        return stdnn.ReLU(pre_activation)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradients for (w1, b1) as flat Python lists."""
        grad_w, grad_b = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1])
        return grad_w.data.flatten().tolist(), grad_b.data.flatten().tolist()

 # Test configuration and random inputs shared by both implementations.
 input_features = 16
 output_features = 32
 batch_size = 4
 x = np.random.randn(batch_size, input_features).astype(np.float32)
 y = np.random.randn(batch_size, output_features).astype(np.float32)

 # Model under test: forward output, loss and gradients.
 model = LinearTestModel(input_features, output_features)
 test_x = nn.Constant(x)
 predict_y = model.forward(test_x).data()
 test_y = nn.Constant(y)
 loss = model.get_loss(test_x, test_y).data()
 g_w1, g_b1 = model.backward(test_x, test_y)

 # Reference model, initialised from the model under test.
 stdmodel = StdLinerTestModel(input_features, output_features, model)
 std_test_x = stdnn.Constant(x)
 std_predict_y = stdmodel.forward(std_test_x)
 std_test_y = stdnn.Constant(y)
 std_loss = stdmodel.get_loss(std_test_x, std_test_y)
 std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)

 # check forward
 # Flatten the whole reference output so every row of the batch is compared;
 # taking only row 0 would make zip() stop after the first sample.
 for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
    if abs(got - want) > 1e-4:
        assert 0, "Forward data mismatch!"

 # check loss
 if abs(loss[0] - std_loss.data) > 1e-4:
    assert 0, "Loss mismatch!"

 # check backward
 # Loop variables are named got/want so the input arrays x and y are not overwritten.
 for i, (got, want) in enumerate(zip(g_w1, std_g_w1)):
    if abs(got - want) > 1e-4:
        assert 0, f"Gradient w1 mismatch at position {i}, g_w1 is {got} while std g_w1 is {want}"
 for i, (got, want) in enumerate(zip(g_b1, std_g_b1)):
    if abs(got - want) > 1e-4:
        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {got} while std g_b1 is {want}"


 print("Test passed")
--- a/frontend/uct/test/04_2layers_test.py
+++ b/frontend/uct/test/04_2layers_test.py
@@ -0,0 +1,144 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np
 np.random.seed(42)
 class LinearTestModel:
    """uctc model under test: Linear -> AddBias -> ReLU -> Linear -> AddBias with SquareLoss."""

    def __init__(self, input_features, hidden_features, output_features):
        hidden_weight_shape = [input_features, hidden_features]
        hidden_bias_shape = [1, hidden_features]
        output_weight_shape = [hidden_features, output_features]
        output_bias_shape = [1, output_features]
        self.w1 = nn.Parameter(hidden_weight_shape)
        self.b1 = nn.Parameter(hidden_bias_shape)
        self.w2 = nn.Parameter(output_weight_shape)
        self.b2 = nn.Parameter(output_bias_shape)

    def forward(self, x):
        """Return the output node of the two-layer network applied to x."""
        hidden = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        return nn.AddBias(nn.Linear(hidden, self.w2), self.b2)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return nn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the data of the loss gradients for (w1, b1, w2, b2)."""
        grad_w1, grad_b1, grad_w2, grad_b2 = nn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        return grad_w1.data(), grad_b1.data(), grad_w2.data(), grad_b2.data()

    def update(self, x, y, lr):
        """
        Apply one update with multiplier lr to every parameter, print the
        gradient data, and return the parameter data for (w1, b1, w2, b2).
        """
        grad_w1, grad_b1, grad_w2, grad_b2 = nn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        pairs = (
            (self.w1, grad_w1),
            (self.b1, grad_b1),
            (self.w2, grad_w2),
            (self.b2, grad_b2),
        )
        for parameter, gradient in pairs:
            parameter.update(gradient, lr)
        # Gradients are printed only after all four updates have been applied.
        for _, gradient in pairs:
            print(gradient.data())
        return self.w1.data(), self.b1.data(), self.w2.data(), self.b2.data()


 class StdLinerTestModel:
    """Reference (std_model) counterpart of the two-layer LinearTestModel, using the same parameter values."""

    def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel):
        # Create all four parameters first, in the original order, before any
        # values are overwritten.
        self.w1 = stdnn.Parameter(input_features, hidden_features)
        self.b1 = stdnn.Parameter(1, hidden_features)
        self.w2 = stdnn.Parameter(hidden_features, output_features)
        self.b2 = stdnn.Parameter(1, output_features)
        # Copy the values held by the model under test into the reference parameters.
        copies = (
            (self.w1, tmodel.w1, (input_features, hidden_features)),
            (self.b1, tmodel.b1, (1, hidden_features)),
            (self.w2, tmodel.w2, (hidden_features, output_features)),
            (self.b2, tmodel.b2, (1, output_features)),
        )
        for target, source, (rows, cols) in copies:
            target.data = np.array(source.data()).reshape(rows, cols)

    def forward(self, x):
        """Return the output node of the two-layer network applied to x."""
        hidden = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
        return stdnn.AddBias(stdnn.Linear(hidden, self.w2), self.b2)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradients for (w1, b1, w2, b2) as flat Python lists."""
        grad_w1, grad_b1, grad_w2, grad_b2 = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        return (grad_w1.data.flatten().tolist(), grad_b1.data.flatten().tolist(),
                grad_w2.data.flatten().tolist(), grad_b2.data.flatten().tolist())

    def update(self, x, y, lr):
        """
        Apply one update with multiplier -lr to every parameter and return the
        parameter values for (w1, b1, w2, b2) as flat Python lists.
        """
        grad_w1, grad_b1, grad_w2, grad_b2 = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        step = -lr
        self.w1.update(grad_w1, step)
        self.b1.update(grad_b1, step)
        self.w2.update(grad_w2, step)
        self.b2.update(grad_b2, step)
        return (self.w1.data.flatten().tolist(), self.b1.data.flatten().tolist(),
                self.w2.data.flatten().tolist(), self.b2.data.flatten().tolist())

 # Test configuration: a 1 -> 50 -> 1 network evaluated on one fixed batch of 10 samples.
 input_features = 1
 hidden_features = 50
 output_features = 1
 batch_size = 10
 # Hard-coded inputs and targets, reshaped to one column per sample.
 x = np.array([-5.146528720855713, 4.451905250549316, 0.4736069440841675, -0.09472138434648514, 4.8939385414123535, 5.209676265716553, -5.967447280883789, 2.9363629817962646, -5.525413990020752, 3.315248489379883]).reshape(batch_size, -1)
 y = np.array([0.9072322249412537, -0.9662654995918274, 0.45609915256500244, -0.09457980841398239, -0.9835651516914368, -0.8788799047470093, 0.3105180263519287, 0.2037920206785202, 0.6873041391372681, -0.17278438806533813]).reshape(batch_size, -1)

 # The reference model is built from the model under test so both start from the same parameters.
 model = LinearTestModel(input_features, hidden_features, output_features)
 stdmodel = StdLinerTestModel(input_features, hidden_features, output_features, model)

 # Model under test: forward output, loss, gradients, then one update.
 test_x = nn.Constant(x)
 predict_y = model.forward(test_x).data()
 test_y = nn.Constant(y)
 loss = model.get_loss(test_x, test_y).data()
 g_w1, g_b1, g_w2, g_b2 = model.backward(test_x, test_y)
 # NOTE(review): the multiplier is 0, so the update presumably leaves the parameters unchanged — confirm this is intended.
 new_w1, new_b1, new_w2, new_b2 = model.update(test_x, test_y, 0)


 # Reference model: the same sequence of operations.
 std_test_x = stdnn.Constant(x)
 std_predict_y = stdmodel.forward(std_test_x)
 std_test_y = stdnn.Constant(y)
 std_loss = stdmodel.get_loss(std_test_x, std_test_y)
 std_g_w1, std_g_b1, std_g_w2, std_g_b2 = stdmodel.backward(std_test_x, std_test_y)
 std_new_w1, std_new_b1, std_new_w2, std_new_b2 = stdmodel.update(std_test_x, std_test_y, 0)

 # print(predict_y)
 # print()
 # print(std_predict_y.data.flatten().tolist())
 # check forward
 # Note: the loops below rebind the module-level names x and y to scalars.
 for x, y in zip(predict_y, std_predict_y.data.flatten().tolist()):
    if (abs(x-y) > 1e-4):
        assert 0, "Forward data mismatch!"

 # print(loss, std_loss.data)
 # check loss
 if abs(loss[0] - std_loss.data) > 1e-4:
    assert 0, "Loss mismatch!"

 # check backward
 for i, (x, y) in enumerate(zip(g_w1, std_g_w1)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Gradient w1 mismatch at position {i}, g_w1 is {x} while std g_w1 is {y}"
 for i, (x, y) in enumerate(zip(g_b1, std_g_b1)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {x} while std g_b1 is {y}"
 for i, (x, y) in enumerate(zip(g_w2, std_g_w2)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Gradient w2 mismatch at position {i}, g_w2 is {x} while std g_w2 is {y}"
 for i, (x, y) in enumerate(zip(g_b2, std_g_b2)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Gradient b2 mismatch at position {i}, g_b2 is {x} while std g_b2 is {y}"

 # check update
 # Only b1 and w1 are verified; the b2 and w2 checks below are disabled.
 for i, (x, y) in enumerate(zip(new_b1, std_new_b1)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Updated b1 mismatch at position {i}, new_b1 is {x} while std new_b1 is {y}"
 for i, (x, y) in enumerate(zip(new_w1, std_new_w1)):
    if (abs(x-y) > 1e-4):
        assert 0, f"Updated w1 mismatch at position {i}, new_w1 is {x} while std new_w1 is {y}"
 # for i, (x, y) in enumerate(zip(new_b2, std_new_b2)):
 #     if (abs(x-y) > 1e-4):
 #         assert 0, f"Updated b2 mismatch at position {i}, new_b2 is {x} while std new_b2 is {y}"
 # for i, (x, y) in enumerate(zip(new_w2, std_new_w2)):
 #     if (abs(x-y) > 1e-4):
 #         assert 0, f"Updated w2 mismatch at position {i}, new_w2 is {x} while std new_w2 is {y}"
 print("Test passed")
--- a/frontend/uct/test/05_training_test.py
+++ b/frontend/uct/test/05_training_test.py
@@ -0,0 +1,128 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np
 np.random.seed(42)
 class LinearTestModel:
    """uctc model under test: Linear -> AddBias -> ReLU -> Linear -> AddBias, trained on sin(x)."""

    def __init__(self, input_features, hidden_features, output_features):
        self.w1 = nn.Parameter([input_features, hidden_features])
        self.b1 = nn.Parameter([1, hidden_features])
        self.w2 = nn.Parameter([hidden_features, output_features])
        self.b2 = nn.Parameter([1, output_features])

    def forward(self, x):
        """Return the output node of the two-layer network applied to x."""
        layer_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
        prediction = nn.AddBias(nn.Linear(layer_1, self.w2), self.b2)
        # print(f"o1: {prediction.data()[:10]}")
        return prediction

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        return nn.SquareLoss(self.forward(x), y)

    def backward(self, x, y):
        """Return the data of the loss gradients for (w1, b1, w2, b2)."""
        loss = self.get_loss(x, y)
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
        return g_w1.data(), g_b1.data(), g_w2.data(), g_b2.data()

    def update(self, x, y, lr):
        """Apply one update with multiplier lr to every parameter; returns None."""
        loss = self.get_loss(x, y)
        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
        self.w1.update(g_w1, lr)
        self.b1.update(g_b1, lr)
        self.w2.update(g_w2, lr)
        self.b2.update(g_b2, lr)
        # print(g_w1.data())
        # print(g_b1.data())
        # print(g_w2.data())
        # print(g_b2.data())
        # return self.w1.data(), self.b1.data(), self.w2.data(), self.b2.data()

    def train(self):
        """
        Train on 200 samples of sin(x) over [-2pi, 2pi].

        Reads the module-level `epoch` and `batch_size` values. After each
        epoch the loss of the final batch is printed.
        """
        self.x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
        self.argsort_x = np.argsort(self.x.flatten())
        self.y = np.sin(self.x)
        for i in range(epoch):
            np.random.RandomState(0).shuffle(self.x)
            # The shuffle reorders self.x in place, so the targets and the sort
            # index are recomputed to keep every (x, y) pair aligned.
            self.y = np.sin(self.x)
            self.argsort_x = np.argsort(self.x.flatten())
            index = 0
            while index < self.x.shape[0]:
                x = self.x[index:index + batch_size]
                y = self.y[index:index + batch_size]
                cx = nn.Constant(x)
                cy = nn.Constant(y)
                self.update(cx, cy, 0.01)
                index += batch_size
                # break
            loss = self.get_loss(cx,cy)
            print(loss.data())


 class StdLinerTestModel:
    """Reference (std_model) two-layer network trained on sin(x)."""

    def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel):
        # tmodel is accepted but not read: copying its weights is disabled below,
        # so the parameters keep their own random initial values.
        hidden_weight = stdnn.Parameter(input_features, hidden_features)
        hidden_bias = stdnn.Parameter(1, hidden_features)
        output_weight = stdnn.Parameter(hidden_features, output_features)
        output_bias = stdnn.Parameter(1, output_features)
        self.w1, self.b1 = hidden_weight, hidden_bias
        self.w2, self.b2 = output_weight, output_bias
        # self.w1.data = np.array(tmodel.w1.data()).reshape(input_features, hidden_features)
        # self.b1.data = np.array(tmodel.b1.data()).reshape(1, hidden_features)
        # self.w2.data = np.array(tmodel.w2.data()).reshape(hidden_features, output_features)
        # self.b2.data = np.array(tmodel.b2.data()).reshape(1, output_features)

    def forward(self, x):
        """Return the output node of the two-layer network applied to x."""
        hidden = stdnn.Linear(x, self.w1)
        hidden = stdnn.AddBias(hidden, self.b1)
        hidden = stdnn.ReLU(hidden)
        output = stdnn.Linear(hidden, self.w2)
        return stdnn.AddBias(output, self.b2)

    def get_loss(self, x, y):
        """Return the SquareLoss node between the forward output and y."""
        prediction = self.forward(x)
        return stdnn.SquareLoss(prediction, y)

    def backward(self, x, y):
        """Return the gradients for (w1, b1, w2, b2) as flat Python lists."""
        grad_w1, grad_b1, grad_w2, grad_b2 = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        return (grad_w1.data.flatten().tolist(), grad_b1.data.flatten().tolist(),
                grad_w2.data.flatten().tolist(), grad_b2.data.flatten().tolist())

    def update(self, x, y, lr):
        """Apply one update with multiplier -lr to every parameter; returns None."""
        grad_w1, grad_b1, grad_w2, grad_b2 = stdnn.gradients(
            self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
        step = -lr
        self.w1.update(grad_w1, step)
        self.b1.update(grad_b1, step)
        self.w2.update(grad_w2, step)
        self.b2.update(grad_b2, step)

    def train(self):
        """
        Run the training loop on 200 samples of sin(x) over [-2pi, 2pi].

        Reads the module-level `epoch` and `batch_size` values. The batch loop
        stops after its first batch, and that batch's loss is printed once
        per epoch.
        """
        samples = np.linspace(-2 * np.pi, 2 * np.pi, num=200)
        self.x = np.expand_dims(samples, axis=1)
        self.argsort_x = np.argsort(self.x.flatten())
        self.y = np.sin(self.x)
        for _ in range(epoch):
            offset = 0
            while offset < self.x.shape[0]:
                window = slice(offset, offset + batch_size)
                cx = stdnn.Constant(self.x[window])
                cy = stdnn.Constant(self.y[window])
                self.update(cx, cy, 0.01)
                offset += batch_size
                # Leave the batch loop after the first batch, as the original does.
                break
            print(self.get_loss(cx, cy).data)

 # Module-level configuration; `batch_size` and `epoch` are read by the train() methods above.
 input_features = 1
 hidden_features = 50
 output_features = 1
 batch_size = 10
 epoch = 1

 model = LinearTestModel(input_features, hidden_features, output_features)
 smodel = StdLinerTestModel(input_features, hidden_features, output_features, model)

 # Training of the uctc model is currently disabled; only the reference model is trained.
 # model.train()


 smodel.train()
--- a/frontend/uct/test/06_mnist_test.py
+++ b/frontend/uct/test/06_mnist_test.py
@@ -0,0 +1,144 @@
 import uctc.nn as nn
 import std_model as stdnn
 import numpy as np
 from data6 import x, y
 np.random.seed(42)

 def parameter_data(*shape):
    """
    Return a float32 array of the given 2-D shape with values drawn uniformly
    from [-limit, limit], where limit = sqrt(3 / mean(shape)).

    Both dimensions must be positive ints; this is checked with assertions.
    """
    rank = len(shape)
    assert rank == 2, (
            "Shape must have 2 dimensions, instead has {}".format(rank))
    dims_are_valid = all(isinstance(dim, int) and dim > 0 for dim in shape)
    assert dims_are_valid, (
            "Shape must consist of positive integers, got {!r}".format(shape))
    bound = np.sqrt(3.0 / np.mean(shape))
    samples = np.random.uniform(low=-bound, high=bound, size=shape)
    return samples.astype(np.float32)


 class MNISTModel:
    """uctc three-layer classifier (784 -> 200 -> 100 -> 10) trained with SoftmaxLoss."""

    def __init__(self):
        self.input_features, self.h1, self.h2, self.output_features = 784, 200, 100, 10
        self.lr = 0.01
        self.batch_size = 100
        # Draw all initial values first, in the order w1, b1, w2, b2, w3, b3.
        # The raw arrays stay on the instance so a reference model can reuse them.
        self.w1data = parameter_data(self.input_features, self.h1)
        self.b1data = parameter_data(1, self.h1)
        self.w2data = parameter_data(self.h1, self.h2)
        self.b2data = parameter_data(1, self.h2)
        self.w3data = parameter_data(self.h2, self.output_features)
        self.b3data = parameter_data(1, self.output_features)
        # Wrap each array in a Parameter, in the same order.
        for name in ("w1", "b1", "w2", "b2", "w3", "b3"):
            setattr(self, name, nn.Parameter(getattr(self, name + "data")))

    def run(self, x):
        """Return the output node of the network for input node x."""
        hidden_1 = nn.Linear(x, self.w1)
        hidden_1 = nn.ReLU(nn.AddBias(hidden_1, self.b1))
        hidden_2 = nn.Linear(hidden_1, self.w2)
        hidden_2 = nn.ReLU(nn.AddBias(hidden_2, self.b2))
        logits = nn.Linear(hidden_2, self.w3)
        return nn.AddBias(logits, self.b3)

    def get_loss(self, x, y):
        """Return the SoftmaxLoss node between the network output and y."""
        logits = self.run(x)
        return nn.SoftmaxLoss(logits, y)

    def train(self, x, y):
        """
        Apply one update with multiplier self.lr to every parameter and return
        the gradient data for (w1, b1, w2, b2, w3, b3).
        """
        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = nn.gradients(
            self.get_loss(x, y),
            [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
        pairs = (
            (self.w1, g_w1), (self.b1, g_b1),
            (self.w2, g_w2), (self.b2, g_b2),
            (self.w3, g_w3), (self.b3, g_b3),
        )
        for parameter, gradient in pairs:
            parameter.update(gradient, self.lr)
        return tuple(gradient.data() for _, gradient in pairs)

 class StdMNISTModel:
    """Reference (std_model) three-layer classifier that reuses the initial arrays stored on an MNISTModel."""

    def __init__(self, model: MNISTModel):
        self.input_features, self.h1, self.h2, self.output_features = 784, 200, 100, 10
        self.lr = 0.01
        self.batch_size = 100
        # (attribute name, rows, cols, initial values), in construction order.
        layout = (
            ("w1", self.input_features, self.h1, model.w1data),
            ("b1", 1, self.h1, model.b1data),
            ("w2", self.h1, self.h2, model.w2data),
            ("b2", 1, self.h2, model.b2data),
            ("w3", self.h2, self.output_features, model.w3data),
            ("b3", 1, self.output_features, model.b3data),
        )
        for name, rows, cols, values in layout:
            node = stdnn.Parameter(rows, cols)
            # Replace the random initial values with the array from `model`
            # (the same array object, not a copy).
            node.data = values
            setattr(self, name, node)

    def run(self, x):
        """Return the output node of the network for input node x."""
        hidden_1 = stdnn.Linear(x, self.w1)
        hidden_1 = stdnn.ReLU(stdnn.AddBias(hidden_1, self.b1))
        hidden_2 = stdnn.Linear(hidden_1, self.w2)
        hidden_2 = stdnn.ReLU(stdnn.AddBias(hidden_2, self.b2))
        logits = stdnn.Linear(hidden_2, self.w3)
        return stdnn.AddBias(logits, self.b3)

    def get_loss(self, x, y):
        """Return the SoftmaxLoss node between the network output and y."""
        logits = self.run(x)
        return stdnn.SoftmaxLoss(logits, y)

    def train(self, x, y):
        """
        Apply one update with multiplier -self.lr to every parameter and return
        the gradients for (w1, b1, w2, b2, w3, b3) as flat Python lists.
        """
        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = stdnn.gradients(
            self.get_loss(x, y),
            [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
        pairs = (
            (self.w1, g_w1), (self.b1, g_b1),
            (self.w2, g_w2), (self.b2, g_b2),
            (self.w3, g_w3), (self.b3, g_b3),
        )
        for parameter, gradient in pairs:
            parameter.update(gradient, -self.lr)
        return tuple(gradient.data.flatten().tolist() for _, gradient in pairs)

 # Build the uctc model and a reference model that reuses its initial arrays.
 model = MNISTModel()
 smodel = StdMNISTModel(model)

 # Forward pass of the uctc model on the data imported from data6.
 o1_x = nn.Constant(x)
 o1_y = nn.Constant(y)
 o1_out = model.run(o1_x).data()
 print(o1_out)
 # o1_loss = model.get_loss(o1_x, o1_y)
 # print(o1_loss.data()[0])
 # o1_gw1, o1_gb1, o1_gw2, o1_gb2, o1_gw3, o1_gb3 = model.train(o1_x, o1_y)

 # Forward pass of the reference model on the same data.
 o2_x = stdnn.Constant(x)
 o2_y = stdnn.Constant(y)
 o2_out = smodel.run(o2_x).data
 print(o2_out)
 # o2_loss = smodel.get_loss(o2_x, o2_y)
 # print(o2_loss.data)
 # o2_gw1, o2_gb1, o2_gw2, o2_gb2, o2_gw3, o2_gb3 = smodel.train(o2_x, o2_y)

 # Gradient comparisons are disabled. The two outputs are only printed, never
 # compared, so "PASSED" below is printed unconditionally.
 # for i, (a, b) in enumerate(zip(o1_gw1, o2_gw1)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gw1 failed: {i, a, b}")
 #         break
 # for i, (a, b) in enumerate(zip(o1_gb1, o2_gb1)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gb1 failed: {i, a, b}")
 #         break
 # for i, (a, b) in enumerate(zip(o1_gw2, o2_gw2)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gw2 failed: {i, a, b}")
 #         break
 # for i, (a, b) in enumerate(zip(o1_gb2, o2_gb2)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gb2 failed: {i, a, b}")
 #         break
 # for i, (a, b) in enumerate(zip(o1_gw3, o2_gw3)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gw3 failed: {i, a, b}")
 #         break
 # for i, (a, b) in enumerate(zip(o1_gb3, o2_gb3)):
 #     if abs(a - b) > 1e-4:
 #         print(f"gb3 failed: {i, a, b}")
 #         break
 # print(o1_loss.data()[0], o2_loss.data)
 print("PASSED")
--- a/frontend/uct/test/data6.py
+++ b/frontend/uct/test/data6.py
--- a/frontend/uct/test/std_model.py
+++ b/frontend/uct/test/std_model.py
@@ -0,0 +1,393 @@
 import numpy as np

 def format_shape(shape):
    """
    Render a shape as text: dimensions joined by "x" (for example `2x3`).

    A falsy shape (such as the empty tuple) is rendered as `()`.
    """
    if not shape:
        return "()"
    return "x".join(str(dim) for dim in shape)

 class Node(object):
    """
    Common base class for graph nodes.

    `__repr__` reads `self.data.shape`, so subclasses are expected to set
    `self.data` before the node is printed.
    """
    def __repr__(self):
        kind = type(self).__name__
        dims = format_shape(self.data.shape)
        return f"<{kind} shape={dims} at {hex(id(self))}>"

 class DataNode(Node):
    """
    Leaf node that simply holds a value; base class of Parameter and Constant.

    It has no parents, its forward pass returns the stored data, and its
    backward pass produces no parent gradients.
    """
    def __init__(self, data):
        self.data = data
        self.parents = []

    def _forward(self, *inputs):
        # Inputs are ignored: a data node's value is whatever it stores.
        return self.data

    @staticmethod
    def _backward(gradient, *inputs):
        # No parents, hence nothing to propagate.
        return list()

 class Parameter(DataNode):
    """
    A trainable 2-D array of weights for a perceptron or neural network.

    The array is initialised with samples from `np.random.uniform` over
    [-limit, limit), where limit = sqrt(3 / mean(shape)). Use `update` to
    move the parameter along a gradient direction during training.
    """
    def __init__(self, *shape):
        rank = len(shape)
        assert rank == 2, (
            f"Shape must have 2 dimensions, instead has {rank}")
        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            f"Shape must consist of positive integers, got {shape!r}")
        bound = np.sqrt(3.0 / np.mean(shape))
        super().__init__(np.random.uniform(low=-bound, high=bound, size=shape))

    def update(self, direction, multiplier):
        """
        Add `multiplier * direction.data` to this parameter in place.

        `direction` must be a Constant with the same shape as the parameter and
        `multiplier` a Python int or float. Fails if the result contains NaN
        or infinity.
        """
        assert isinstance(direction, Constant), (
            f"Update direction must be a {Constant.__name__} node, "
            f"instead has type {type(direction).__name__!r}")
        assert direction.data.shape == self.data.shape, (
            f"Update direction shape {format_shape(direction.data.shape)} "
            f"does not match parameter shape {format_shape(self.data.shape)}")
        assert isinstance(multiplier, (int, float)), (
            f"Multiplier must be a Python scalar, instead has type "
            f"{type(multiplier).__name__!r}")
        step = multiplier * direction.data
        # In-place add keeps the same underlying array object.
        self.data += step
        assert np.isfinite(self.data).all(), (
            "Parameter contains NaN or infinity after update, cannot continue")

 class Constant(DataNode):
    """
    Wraps a floating-point numpy array as a graph node.

    Constants represent input features, output labels, and the gradients
    returned by `gradients`. They are normally supplied by the dataset or by
    `nn.gradients` rather than constructed by hand.
    """
    def __init__(self, data):
        assert isinstance(data, np.ndarray), (
            f"Data should be a numpy array, instead has type "
            f"{type(data).__name__!r}")
        assert np.issubdtype(data.dtype, np.floating), (
            f"Data should be a float array, instead has data type "
            f"{data.dtype!r}")
        super().__init__(data)

 class FunctionNode(Node):
    """
    A node whose value is computed from other nodes.

    On construction it records its parents and immediately evaluates
    `_forward` on the parents' data, storing the result in `self.data`.
    The recorded parents are what `gradients` walks during back-propagation.
    """
    def __init__(self, *parents):
        assert all(isinstance(p, Node) for p in parents), (
            f"Inputs must be node objects, instead got types "
            f"{tuple(type(p).__name__ for p in parents)!r}")
        self.parents = parents
        values = [p.data for p in parents]
        self.data = self._forward(*values)

 class Add(FunctionNode):
    """
    Element-wise sum of two matrices of identical shape.

    Usage: nn.Add(x, y)
    Inputs:
        x: a Node with shape (batch_size x num_features)
        y: a Node with the same shape as x
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        lhs, rhs = inputs[0], inputs[1]
        assert lhs.ndim == 2, (
            f"First input should have 2 dimensions, instead has {lhs.ndim}")
        assert rhs.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {rhs.ndim}")
        assert lhs.shape == rhs.shape, (
            f"Input shapes should match, instead got {format_shape(lhs.shape)} "
            f"and {format_shape(rhs.shape)}")
        return lhs + rhs

    @staticmethod
    def _backward(gradient, *inputs):
        # A sum passes the upstream gradient unchanged to both operands.
        assert gradient.shape == inputs[0].shape
        return [gradient] * 2

 class AddBias(FunctionNode):
    """
    Adds one bias row vector to every row of a feature matrix.

    Usage: nn.AddBias(features, bias)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        bias: a Node with shape (1 x num_features)
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        features, bias = inputs[0], inputs[1]
        assert features.ndim == 2, (
            f"First input should have 2 dimensions, instead has {features.ndim}")
        assert bias.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {bias.ndim}")
        assert bias.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            f"{format_shape(bias.shape)}")
        assert features.shape[1] == bias.shape[1], (
            "Second dimension of inputs should match, instead got shapes "
            f"{format_shape(features.shape)} and {format_shape(bias.shape)}")
        # Broadcasting repeats the single bias row across the batch.
        return features + bias

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # The bias was added to every row, so its gradient is the column-wise
        # sum over the batch, kept as a (1 x num_features) row.
        bias_grad = np.sum(gradient, axis=0, keepdims=True)
        return [gradient, bias_grad]

 class DotProduct(FunctionNode):
    """
    Dot product of every feature row with a single weight row.

    Usage: nn.DotProduct(features, weights)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        weights: a Node with shape (1 x num_features)
    Output: a Node with shape (batch_size x 1)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            f"First input should have 2 dimensions, instead has {features.ndim}")
        assert weights.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {weights.ndim}")
        assert weights.shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            f"{format_shape(weights.shape)}")
        assert features.shape[1] == weights.shape[1], (
            "Second dimension of inputs should match, instead got shapes "
            f"{format_shape(features.shape)} and {format_shape(weights.shape)}")
        return np.dot(features, weights.T)

    @staticmethod
    def _backward(gradient, *inputs):
        # Deliberately unsupported. A disabled implementation is kept for reference:
        # assert gradient.shape[0] == inputs[0].shape[0]
        # assert gradient.shape[1] == 1
        # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
        message = (
            "Backpropagation through DotProduct nodes is not needed in this "
            "assignment")
        raise NotImplementedError(message)

 class Linear(FunctionNode):
    """
    Matrix product of a feature matrix with a weight matrix.

    Usage: nn.Linear(features, weights)
    Inputs:
        features: a Node with shape (batch_size x input_features)
        weights: a Node with shape (input_features x output_features)
    Output: a Node with shape (batch_size x output_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        features, weights = inputs[0], inputs[1]
        assert features.ndim == 2, (
            f"First input should have 2 dimensions, instead has {features.ndim}")
        assert weights.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {weights.ndim}")
        assert features.shape[1] == weights.shape[0], (
            "Second dimension of first input should match first dimension of "
            f"second input, instead got shapes {format_shape(features.shape)} "
            f"and {format_shape(weights.shape)}")
        return np.dot(features, weights)

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape[0] == inputs[0].shape[0]
        assert gradient.shape[1] == inputs[1].shape[1]
        # d(loss)/d(features) = gradient . weights^T
        grad_features = np.dot(gradient, inputs[1].T)
        # d(loss)/d(weights) = features^T . gradient
        grad_weights = np.dot(inputs[0].T, gradient)
        return [grad_features, grad_weights]

 class ReLU(FunctionNode):
    """
    Element-wise Rectified Linear Unit: max(x, 0).
    Every negative entry of the input is replaced by zero.

    Usage: nn.ReLU(x)
    Input:
        x: a Node with shape (batch_size x num_features)
    Output: a Node with the same shape as x, but no negative entries
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 1, f"Expected 1 input, got {len(inputs)}"
        x = inputs[0]
        assert x.ndim == 2, (
            f"Input should have 2 dimensions, instead has {x.ndim}")
        return np.maximum(x, 0)

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        # 1.0 where the input was strictly positive, 0.0 elsewhere.
        passthrough = np.where(inputs[0] > 0, 1.0, 0.0)
        return [gradient * passthrough]

 class SquareLoss(FunctionNode):
    """
    Mean of 0.5 * (a[i,j] - b[i,j])**2 taken over every position (i,j) of the
    two (batch_size x dim) inputs.

    Usage: nn.SquareLoss(a, b)
    Inputs:
        a: a Node with shape (batch_size x dim)
        b: a Node with shape (batch_size x dim)
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        first, second = inputs[0], inputs[1]
        assert first.ndim == 2, (
            f"First input should have 2 dimensions, instead has {first.ndim}")
        assert second.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {second.ndim}")
        assert first.shape == second.shape, (
            f"Input shapes should match, instead got {format_shape(first.shape)} "
            f"and {format_shape(second.shape)}")
        half_squares = np.square(first - second) / 2
        return np.mean(half_squares)

    @staticmethod
    def _backward(gradient, *inputs):
        # The upstream gradient of a loss node must be a scalar.
        assert np.asarray(gradient).ndim == 0
        count = inputs[0].size
        grad_first = gradient * (inputs[0] - inputs[1]) / count
        grad_second = gradient * (inputs[1] - inputs[0]) / count
        return [grad_first, grad_second]

 class SoftmaxLoss(FunctionNode):
    """
    Batched softmax cross-entropy loss for classification.

    IMPORTANT: the input order matters — logits first, labels second.

    Usage: nn.SoftmaxLoss(logits, labels)
    Inputs:
        logits: a Node with shape (batch_size x num_classes). Each row holds
            the scores for one example belonging to each class; a score can
            be any real number.
        labels: a Node with shape (batch_size x num_classes) encoding the
            correct labels. Entries must be non-negative and every row must
            sum to 1.
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def log_softmax(logits):
        # Subtract the row maximum before exponentiating for numerical stability.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        shifted -= log_norm
        return shifted

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, f"Expected 2 inputs, got {len(inputs)}"
        logits, labels = inputs[0], inputs[1]
        assert logits.ndim == 2, (
            f"First input should have 2 dimensions, instead has {logits.ndim}")
        assert labels.ndim == 2, (
            f"Second input should have 2 dimensions, instead has {labels.ndim}")
        assert logits.shape == labels.shape, (
            f"Input shapes should match, instead got {format_shape(logits.shape)} "
            f"and {format_shape(labels.shape)}")
        assert np.all(labels >= 0), (
            "All entries in the labels input must be non-negative")
        assert np.allclose(np.sum(labels, axis=1), 1), (
            "Labels input must sum to 1 along each row")
        log_probs = SoftmaxLoss.log_softmax(logits)
        per_example = -np.sum(labels * log_probs, axis=1)
        return np.mean(per_example)

    @staticmethod
    def _backward(gradient, *inputs):
        # The upstream gradient of a loss node must be a scalar.
        assert np.asarray(gradient).ndim == 0
        log_probs = SoftmaxLoss.log_softmax(inputs[0])
        batch = inputs[0].shape[0]
        grad_logits = gradient * (np.exp(log_probs) - inputs[1]) / batch
        grad_labels = gradient * -log_probs / batch
        return [grad_logits, grad_labels]

 def gradients(loss, parameters):
    """
    Back-propagate from a loss node and return the gradient of the loss with
    respect to each given parameter.

    Usage: nn.gradients(loss, parameters)
    Inputs:
        loss: a SquareLoss or SoftmaxLoss node
        parameters: a list (or iterable) containing Parameter nodes
    Output: a list of Constant objects, one per parameter and in the same
        order, holding the gradient of the loss for that parameter.

    A loss node can only be used once: it is marked with a `used` attribute
    and a second call with the same node fails.
    """

    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
        f"Loss must be a loss node, instead has type {type(loss).__name__!r}")
    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
        f"Parameters must all have type {Parameter.__name__}, instead got types "
        f"{tuple(type(parameter).__name__ for parameter in parameters)!r}")
    assert not hasattr(loss, "used"), (
        "Loss node has already been used for backpropagation, cannot reuse")

    loss.used = True

    seen = set()
    order = []

    def record(node):
        # Post-order walk: every parent lands on the tape before its child.
        if node in seen:
            return
        for parent in node.parents:
            record(parent)
        seen.add(node)
        order.append(node)

    record(loss)
    # Parameters that the loss does not depend on still get a (zero) gradient.
    seen.update(parameters)

    grads = {node: np.zeros_like(node.data) for node in seen}
    grads[loss] = 1.0

    # Walk the tape from the loss back towards the leaves, accumulating each
    # node's contribution into its parents.
    for node in reversed(order):
        parent_values = [parent.data for parent in node.parents]
        contributions = node._backward(grads[node], *parent_values)
        for parent, contribution in zip(node.parents, contributions):
            grads[parent] += contribution

    return [Constant(grads[parameter]) for parameter in parameters]

 def as_scalar(node):
    """
    Return the single value held by a Node as a standard Python number.

    Only valid for nodes whose data has exactly one element (e.g. SquareLoss
    and SoftmaxLoss, or DotProduct with a batch size of 1 element).

    The node is left untouched: `node.data` keeps its original shape.

    Raises:
        AssertionError: if `node` is not a Node, or its data does not contain
            exactly one element.
    """

    assert isinstance(node, Node), (
        "Input must be a node object, instead has type {!r}".format(
            type(node).__name__))
    assert node.data.size == 1, (
        "Node has shape {}, cannot convert to a scalar".format(
            format_shape(node.data.shape)))
    # Flatten a temporary copy instead of assigning it back to node.data, so
    # that reading a scalar does not reshape the node as a side effect.
    return node.data.flatten().tolist()[0]
--- a/frontend/uct/transformer.py
+++ b/frontend/uct/transformer.py
--- a/frontend/uct/utils.py
+++ b/frontend/uct/utils.py
@@ -0,0 +1,45 @@
 import numpy as np
 import uctc.nn as nn
 np.random.seed(42)
 def parameter_data(*shape):
    """
    Create initial values for a 2-D parameter of the given shape.

    Values are drawn with `np.random.uniform` from [-limit, limit), where
    limit = sqrt(3 / mean(shape)), and returned as a float32 array.
    """
    rank = len(shape)
    assert rank == 2, (
            f"Shape must have 2 dimensions, instead has {rank}")
    assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            f"Shape must consist of positive integers, got {shape!r}")
    bound = np.sqrt(3.0 / np.mean(shape))
    samples = np.random.uniform(low=-bound, high=bound, size=shape)
    return samples.astype(np.float32)

 class Dataset(object):
    """
    Pairs a 2-D float feature array `x` with a 2-D float label array `y`
    that has the same number of rows, and serves them in fixed-size batches.
    """
    def __init__(self, x, y):
        arrays = (x, y)
        assert all(isinstance(array, np.ndarray) for array in arrays)
        assert all(np.issubdtype(array.dtype, np.floating) for array in arrays)
        assert all(array.ndim == 2 for array in arrays)
        assert x.shape[0] == y.shape[0]
        self.x, self.y = x, y

    def iterate_once(self, batch_size):
        """
        Yield consecutive `(nn.Constant(x_batch), nn.Constant(y_batch))` pairs
        covering the dataset once, in order. `batch_size` must be a positive
        int that divides the number of rows.
        """
        assert isinstance(batch_size, int) and batch_size > 0, (
            "Batch size should be a positive integer, got {}".format(batch_size))
        assert self.x.shape[0] % batch_size == 0, (
            "Dataset size {} is not divisible by batch size {}".format(
                self.x.shape[0], batch_size))
        start = 0
        while start < self.x.shape[0]:
            stop = start + batch_size
            features = self.x[start:stop]
            labels = self.y[start:stop]
            yield nn.Constant(features), nn.Constant(labels)
            start = stop

    def iterate_forever(self, batch_size):
        """Yield batches endlessly by restarting `iterate_once` each time it ends."""
        while True:
            yield from self.iterate_once(batch_size)

    def get_validation_accuracy(self):
        """Not available on the base dataset; always raises NotImplementedError."""
        message = (
            "No validation data is available for this dataset. "
            "In this assignment, only the Digit Classification and Language "
            "Identification datasets have validation data.")
        raise NotImplementedError(message)
--- a/lab-guide/00-intro.md
+++ b/lab-guide/00-intro.md
@@ -0,0 +1,36 @@
 ### Welcome to uct lab

 > uct 是Undergraduate Computing Torch的简写。

 欢迎你选择uct作为自己的大实验，在这个大实验中，我们将亲自动手使用C++搭建一个机器学习框架，并完成手写体数据集MNIST的识别。

 注意：你不需要具备任何神经网络方面的前置知识。考虑到《大学计算（下）》面向的是本科一年级学生，我们设计了非常详细的实验指导书来帮助你完成这个实验。

 #### 安装构建工具

 大型的C++项目显然不止是几个文件，而是成百上千个文件，因此我们需要一个工具来管理这些文件。有很多课程会使用到类似的工具（在《操作系统》课程上，你将会遇见Makefile；在《编译原理》、《并行编译与优化》上，你将会用到CMake），在这里我们选择CMake。

 > CMake 是一个开源的跨平台构建系统生成工具，广泛用于管理软件构建过程。它通过生成标准的构建文件（如 Makefile、Visual Studio 项目文件等）来简化跨平台项目的构建流程。

 > 对于经验丰富的同学，如果你喜欢使用别的构建工具（例如Bazel）也是可以的~

 假如你也正在使用WSL(2)，运行下面的命令可以安装好所需要的工具和库

 ```bash
 sudo apt update
 sudo apt install -y build-essential cmake git gcc g++
 ```

 #### 准备Python环境

 首先，你需要在Linux下具备Python环境。相信在《大学计算（上）》中，你已经具备这样的技能。我们以使用WSL+VSCode为例介绍环境配置的具体方案。

 在VSCode中连接WSL，打开对应目录。

 使用`conda`创建一个环境（或使用已有环境），然后执行

 ```
 pip install pybind11
 ```

 而后，通过`pip show pybind11`可以找到`pybind11`的安装路径，将对应的头文件路径添加到`.vscode/c_cpp_properties.json`的`includePath`中。
--- a/lab-guide/01-fundamentals.md
+++ b/lab-guide/01-fundamentals.md
@@ -0,0 +1,117 @@
 ### 第一部分：基本操作

 #### 基本函数的构建

 在这一部分中，我们将完成基本的四则运算和由它们组合而成的初等函数的构建。你需要在cc/operators中补全`ops.h`和`ops.cc`的内容。

 **[TASK 1]** 在`ops.h`中，你需要补全以下函数的实现：

 - `mul`函数，输入为两个数`a`、`b`，输出为它们的乘积。

 - `id`函数，将输入原样输出。

 - `add`函数，输入为两个数`a`、`b`，输出为它们的和。

 - `neg`函数，输入为`a`，输出为`-a`。

 - `lt`函数，输入为两个数`a`、`b`，输出为`(float)(a < b)`。

 - `eq`函数，输入为两个数`a`、`b`，输出为`(float)(a == b)`。

 - `max`函数，输入为两个数`a`、`b`，输出为`a`和`b`中较大的那个。

 它们都是模板函数，相信你已经注意到了，它们都被定义在`.h`文件中，而不是`.cc`文件中，这与C++的模板的实例化机制和编译模型有关。

 模板的实例化机制：模板函数或模板类并不是真正的代码，而是一个“蓝图”或“模式”，编译器在编译时根据这个蓝图生成具体的代码。这个过程称为模板实例化。例如，当你使用一个模板函数时，编译器会根据你传递的类型参数生成一个具体的函数版本。这个生成的过程发生在编译时。

 编译模型：C++采用的是分离编译模型，即每个源文件（.cc 或 .cpp 文件）是独立编译的。编译器在编译一个源文件时，只会看到该源文件及其包含的头文件中的内容。如果你将模板函数的定义放在源文件中，其他源文件在编译时无法看到模板的定义，因此无法生成对应的实例化代码。

 另外，你应当还注意到了我们为这两个文件提供了名叫`operators`的命名空间（namespace）。主要是为了防止不同命名空间中的重名冲突。

 **[TASK 2]** 在`ops.cc`中，你需要完成以下函数的实现：

 - `is_close`函数，输入为两个数`x`、`y`，输出为`(float)(abs(x - y) < epsilon)`。

 - `sigmoid`函数，输入为`x`，为了方便计算，在输出时遵照下面的规则：

 $$
 f(x) =\left\{\begin{matrix}
 \frac{1.0}{(1.0 + e^{-x})}, x\ge 0
 \\
 \frac{e^x}{(1.0 + e^{x})}, \mathrm{otherwise}
 \end{matrix}\right.
 $$

 - `relu`函数，输入为`x`，输出为`x > 0.0 ? x : 0.0`。

 - `inv`函数，输入为`x`，输出为`1.0 / x`。

 - `inv_back`函数，用于计算$f(x)=\frac{1}{x}$的微分$f'(x)\mathrm{d}x$，输入为`x`和`d`，输出为$-\frac{d}{x^2}$。

 - `relu_back`函数，输入为`x`和`d`，输出为`x > 0.0 ? d*1.0 : 0.0`。

 #### 函数式编程基础

 实现`map`、`zipWith`和`reduce`。

 `map`接受一个`std::vector`和一个函数作为输入，返回一个新的`std::vector`，其中每个元素都是输入函数应用于输入`std::vector`中对应元素的结果。具体来说，对于下面这个实现：

 ```cpp
 template<typename T, typename F>
 auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> {

    std::vector<decltype(func(std::declval<T>()))> result;
    result.reserve(vec.size());

    std::transform(vec.begin(), vec.end(), std::back_inserter(result), func);

    return result;
 }
 ```

 有几处可能让你感到疑惑的地方。

 首先，这里的函数返回值居然和Python一样被后置了！`->` 是 C++11 引入的尾置返回类型语法。它的作用是将函数的返回类型放在函数参数列表之后，而不是放在函数名之前。在某些情况下，返回类型可能依赖于函数参数或模板参数，而这些信息在函数名之前是不可用的。尾置返回类型允许我们在函数参数列表之后推导返回类型。

 > 例如，在`map`函数中，返回类型依赖于`func`的返回类型，而`func`的类型在函数名之前是未知的。使用尾置返回类型可以解决这个问题。

 其次，我们使用了`std::declval`。`std::declval`是 C++11 引入的一个工具，用于在编译时模拟一个对象的“假实例”，以便在不实际构造对象的情况下推导类型。

 ```cpp
 decltype(func(std::declval<T>()))
 ```

 > 在`map`函数中，我们需要推导`func`的返回类型。假设`func`是一个函数对象，接受`T`类型的参数并返回某种类型`R`，我们可以使用`std::declval`来模拟调用`func`的过程。

 **[TASK 3]** 在`ops.cc`中，调用我们给出的`map`函数实现和你刚刚完成的`neg`函数，补全`negList`函数（大约需要1行代码）。

 **[TASK 4]** 在`ops.h`中，仿照`map`函数，补全`zipWith`函数（大约需要10行代码）。`zipWith`函数接受两个`vector`和一个函数`func`作为输入，返回一个新的`vector`，这个`vector`中的元素都是两个`vector`逐元素进行函数`func`操作之后的结果。例如，对于`vec1 = [1, 2, 3]`，`vec2 = [5, 6, 7]`，`func`为`add`，那么将返回`[6, 8, 10]`。注意：在进行`zipWith`函数的实现时，你需要考虑输入的两个`std::vector`长度不一致的情况，对于这种情况，你简单地`throw`一个异常即可。

 **[TASK 5]** 在`ops.cc`中，使用你实现的`zipWith`和`add`函数，实现`addLists`函数（大约需要1行代码）。

 **[TASK 6]** 实际上你会发现`std::accumulate`（问一问LLM这个是个啥）就能够承担`reduce`函数的功能，因此你可以直接使用`std::accumulate`来实现`reduce`函数。这个任务需要你使用`reduce`函数实现`sumList`（将一个列表中的元素相加）和`prodList`（将一个列表中的元素相乘）函数（大约分别需要1行代码）。

 #### 检查结果

 做完了？很好，切换到`cc`，执行下面的语句来编译框架

 ```
 cmake -S . -B build
 cd build
 make
 ```
 现在，编辑系统环境变量

 ```
 echo 'export PYTHONPATH="??????"' >> ~/.bashrc
 ```

 将`??????`替换为刚刚生成的`build`文件夹的绝对路径，这个路径应该形如

 ```Python
 /home/hexu/learn/uc-modern-cpp-student/cc/build
 ```

 > 可以切换到`build`目录下，执行`pwd`命令来获取绝对路径。

 好了，不出意外的话，就再也别动`~/.bashrc`了。现在还有一个`frontend/framework/basis/test_task1.py`文件。切换到目录`frontend/framework/basis/`，直接运行task1到task6的文件，如果没有任何报错，说明你已经完成了这一关！🎉
--- a/lab-guide/02-autodiff.md
+++ b/lab-guide/02-autodiff.md
@@ -0,0 +1,56 @@
 ### 第二部分：自动微分

 #### 数值微分

 有时候，我们无需知道一个函数具体的表达式，借助导数的定义，利用计算机可以求解出在某一点的导数值。这种方法称为数值微分。举个例子，对于任何一个$f(x)$，我们当然可以根据定义求出其在$x=x_0$处的导数，即

 $$f'(x)|_{x=x_0} \approx \frac{f(x_0+\varepsilon)-f(x_0 - \varepsilon)}{2\varepsilon }$$

 其中$\varepsilon$是一个很小的正数。但是，如果$f(x)$的表达式非常复杂，那么我们可能无法直接求出导数。此时，我们可以借助数值微分来求解导数值。下面我们以$f(x)=x^2$为例，演示如何使用数值微分求解导数值。

 ```python
 import numpy as np

 def f(x):
    return x**2

 def numerical_diff(f, x):
    h = 1e-4
    return (f(x+h) - f(x-h)) / (2*h)

 x = 5.0
 print(numerical_diff(f, x))  # 约为 10.0
 ```

 当然，你现在需要用C++来完成这件事。

 **[TASK 7]** 补全`operators/autodiff.h`中的`central_difference`函数，实现数值微分，求出$f(x_1, x_2, ..., x_n)$对第$arg$个参数的偏导数值。


 #### 高等数学中的导数

 还记得$z = x + y$，对$x$和$y$分别求导的结果是什么吗？显然，根据多元函数的求导法则，有$\frac{\partial z}{\partial x}=1$，以及$\frac{\partial z}{\partial y}=1$。如果我们再考虑梯度，那么$z$的梯度就是$\nabla z = (1, 1)$。那么，对于更复杂的函数，比如$f(x, y) = x^2 + y^2$，其梯度$\nabla f$又是什么呢？

 **[TASK 8]** 补全`operators/autodiff.h`中的`Add`类，能够对表达式$z = x + y$求导。

 提示：补全`forward`和`backward`函数，分别实现前向传播和反向传播。前向传播：得到`a + b`的值；反向传播：得到`a`和`b`的梯度（也就是结果分别对`a`、`b`的偏导数，再乘上上游传来的梯度`d_input`）。

 **[TASK 9]** 仿照`Add`类构造`operators/autodiff.h`中的`Mul`类，能够对表达式$z = x \cdot y$求导。

 **[TASK 10]** 仿照`Add`类构造`operators/autodiff.h`中的`Log`类，能够对表达式$z = \log(x)$求导。提示：使用`<cmath>`提供的`logf`函数。

 **[TASK 11]** 仿照`Add`类构造`operators/autodiff.h`中的`Inv`类，能够对表达式$z = 1 / x$求导。

 **[TASK 12]** 仿照`Add`类构造`operators/autodiff.h`中的`Sigmoid`类，能够对表达式$z = sigmoid(x)$求导。提示：使用`<cmath>`提供的`expf`函数。

 #### 检查结果

 做完了？很好，切换到`cc`，执行下面的语句来编译框架

 ```
 cmake -S . -B build
 cd build
 make
 ```
 如果你已经完成了01，那么环境变量应该是好的。否则，请回到01的实验手册，查看如何修改环境变量。

 现在还有一个`frontend/framework/autodiff/test_task7.py`文件。切换到目录`frontend/framework/autodiff/`，直接运行相应的task文件，如果没有任何报错，说明你已经完成了这一关！🎉
--- a/lab-guide/03-framework.md
+++ b/lab-guide/03-framework.md
@@ -0,0 +1,125 @@
 ### 第三部分：进入人工智能的世界

 > 前两关是不是很简单？

 相信你在前两部分中，已经积累了足够多的C++知识，也回忆起了足够多的高等数学知识。现在，我们要构造一个框架，这个框架可以接受一个矩阵作为输入，并且支持神经网络中的常见的网络层，例如

 - 线性层（Linear）
 - 激活层（Activation）
 - 损失层（Loss）

 #### 张量类

 我们已经在`cc/tensor/tensor.h`中定义了张量类，这个类可以表示一个多维数组，并且支持常见的数学运算。我们可以在`cc/tensor/tensor.cc`中实现这些运算。当然，我们假定所有的张量都是二维的，这样你就不必考虑各种情况。

 **[TASK 13]** 补全`cc/tensor/tensor.cc`中关于`Tensor::transpose()`的函数实现。它能够将一个张量进行转置。

 **[TASK 14]** 补全`cc/tensor/tensor.cc`中关于`argmax(const std::shared_ptr<Tensor>& tensor, int axis)`的函数实现，它能够返回一个张量在指定维度上的最大值的索引。提示：你可以使用`std::numeric_limits<float>::infinity()`，可以通过LLM来查询它的含义。

 > 前面做了这么多次测试，你是不是该自己学会写测试了？...算了，还是我来帮你写吧...😂

 测试文件：`frontend/framework/tensor/task13_14.py`

 **关于测试用例** 之后的内容的测试用例可以参考`frontend/uct/test`下的文件，或依据自己的需要编写。

 #### 线性层

 线性层是神经网络中最为常见的网络层，它接受一个输入张量，并且输出一个张量。输入两个张量`feature: (batch_size x input_features)`和`weight: (input_features x output_features)`，输出张量`output: (batch_size x output_features)`，实际上就是将`feature`矩阵和`weight`矩阵相乘。

 用公式表示就是$y = xW$，其中$x$是`feature`，$W$是`weight`（偏置项$b$将在后面的“偏置”一节中单独实现）。

 **[TASK 15]** 补全`cc/operators/nn.h`中`Linear`类的构造函数和`forward`函数。

 - 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示输入特征和权重。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。

 - `forward`函数：参见有关线性层的介绍。


 **[TASK 16]** 补全`cc/operators/nn.cc`中`Linear`类的`backward`函数。

 - `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grad_features`和`grad_weights`，它们分别表示对`features`和`weights`的梯度。

 > 数学Tips：`grad_features`是通过将`gradient`与`weights`的转置相乘得到的。`grad_weights`是通过将`features`的转置与`gradient`相乘得到的。

 完成了这两个任务后，你应该可以在`cc/`下执行

 ```
 cmake -S . -B build
 cmake --build build
 ```

 就能够编译你的代码。然后，你应当可以运行`frontend/uct/perception.py`，它将使用你实现的线性层来训练一个感知机。

 #### 激活层

 激活层是神经网络中常见的网络层，它接受一个输入张量，并且输出一个张量。输入一个张量`x`，输出一个张量`y`，实际上就是将`x`中的每个元素进行某种变换。

 用公式表示就是$y = f(x)$。对于`ReLU`函数来说，$y = max(0, x)$。

 **[TASK 17]** 补全`cc/operators/nn.h`中`ReLU`类的构造函数和`forward`函数。

 - 构造函数：构造函数接受一个参数`a`，它是一个`std::shared_ptr<Node>`类型的智能指针，表示输入特征。构造函数调用基类`FunctionNode`的构造函数，并将`a`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。

 - `forward`函数：参见有关激活层的介绍。

 **[TASK 18]** 补全`cc/operators/nn.cc`中`ReLU`类的`backward`函数。

 - `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grads`，它表示对`features`的梯度。

 > 数学Tips：`grads`是通过将`gradient`与一个掩码逐元素相乘得到的：`x`中大于0的位置掩码为1，其余位置掩码为0。

 #### 偏置

 线性层中，我们没有实现偏置项`b`，它是一个向量，它的维度与输出特征的维度相同。偏置项的作用是使得线性层的输出能够更好地拟合数据。

 **[TASK 19]** 补全`cc/operators/nn.h`中`AddBias`类的构造函数和`forward`函数。

 - 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示输入特征和偏置。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给 `this->data`。

 - `forward`函数：`forward`方法实现前向传播，将偏置添加到输入特征上。`features`和`bias`分别从`this->objects`中获取，`features`的形状为`(batch_size x num_features)`，`bias`的形状为`(1 x num_features)`。在函数中，需要创建一个与`features`形状相同的输出张量`outNode`，使用嵌套循环将`features`的每个元素与`bias`的对应元素相加，结果存储在`outNode`中。最后，返回`outNode`。

 **[TASK 20]** 补全`cc/operators/nn.cc`中`AddBias`类的`backward`函数。

 - `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grad_features`和`grad_bias`，它们分别表示对`features`和`bias`的梯度。

 > 数学Tips：`grad_features`就是`gradient`的拷贝。而`bias`被加到了每一行上，因此在计算`bias`的梯度时，需要将`gradient`沿`batch_size`维度求和（即把每一列的元素相加），得到形状为`(1 x num_features)`的`grad_bias`。

 #### 损失层——均方误差损失函数

 我们首先实现均方误差损失函数，它接受两个张量`y_pred`和`y_true`，它们分别表示预测值和真实值，输出一个标量，表示预测值与真实值之间的误差。

 用公式表示就是$\displaystyle loss = \frac{1}{2n} \sum_{i=1}^{n} (y_{pred,i} - y_{true,i})^2$，其中$n$是张量中元素的总个数。

 **[TASK 21]** 补全`cc/operators/nn.h`中`SquareLoss`类的构造函数和`forward`函数。

 - 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示预测值和真实值。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。

 - `forward`函数用于计算损失。

 **[TASK 22]** 补全`cc/operators/nn.cc`中`SquareLoss`类的`backward`函数。

 - `backward`函数计算损失函数相对于输入`a`和`b`的梯度。`gradient`是损失函数对输出的梯度（是一个形状为(1, 1)的张量，可以直接认为其是一个标量`g`）。`grad_a`和`grad_b`分别存储`a`和`b`的梯度。对于每个元素，`grad_a`计算为`g * (a->data->data[i] - b->data->data[i]) / a->data->size`，`grad_b`则是它的相反数，即`g * (b->data->data[i] - a->data->data[i]) / a->data->size`。最终返回由`grad_a`和`grad_b`组成的向量。

 #### 损失层——SoftmaxLoss

 接下来，我们实现Softmax损失函数，它接受两个张量`y_pred`和`y_true`，它们分别表示预测值和真实值，输出一个标量，表示预测值与真实值之间的误差。

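 用公式表示就是$\displaystyle loss = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j} y_{true,ij} \log\left(\mathrm{softmax}(y_{pred})_{ij}\right)$，其中$N$是`batch_size`，$\displaystyle \mathrm{softmax}(z)_{ij} = \frac{e^{z_{ij}}}{\sum_{k} e^{z_{ik}}}$。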

 **[TASK 23]** 补全`cc/operators/nn.h`中`SoftmaxLoss`类的构造函数，`forward`函数和`backward`函数。

 完成上述内容后，你可以编译和运行`frontend/uct/regression.py`，使用线性网络来拟合`sin`函数。

 ### 手写体识别

 补全代码中其他标注有`TODO`的内容，最后编译运行，你就能够训练一个手写体识别模型了。运行`frontend/uct/mnist.py`来试一下吧！

 > 是不是觉得运行得有点慢？考虑使用多线程来加速矩阵运算。（这已经超出了这门课的要求，对高性能计算/并行计算感兴趣的同学可以勇于尝试！）

 ### extra bonus

 想打副本？

 ```
 nslookup -type=txt uc-cpp.shahe.org
 ```