diff --git a/cc/CMakeLists.txt b/cc/CMakeLists.txt
new file mode 100644
index 0000000..5e6444c
--- /dev/null
+++ b/cc/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.19)
+project(uctc)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)  # replaces `set(CXX g++)`, which only created an unused variable named CXX
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")  # append -O3 instead of discarding flags passed by the user/toolchain
+set(PYBIND11_DIR /home/hexu/miniconda3/lib/python3.11/site-packages/pybind11 CACHE PATH "pybind11 install prefix (override with -DPYBIND11_DIR=...)")
+set(PYBIND11_FINDPYTHON ON)
+find_package(pybind11 CONFIG REQUIRED PATHS ${PYBIND11_DIR})
+pybind11_add_module(uctc uctc.cc math/arith.cc operators/nn.cc operators/ops.cc tensor/tensor.cc)
+# After each build of the module, run pybind11-stubgen from the build directory and write its output there.
+add_custom_command(
+    TARGET uctc POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E echo "Changing directory and running Python script for generate interpreter annotations"
+    COMMAND ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR} pybind11-stubgen uctc --output-dir .
+)
\ No newline at end of file
diff --git a/cc/math/arith.cc b/cc/math/arith.cc
new file mode 100644
index 0000000..56d78c3
--- /dev/null
+++ b/cc/math/arith.cc
@@ -0,0 +1,14 @@
+#include "arith.h"
+
+namespace arith {
+// Single-precision square root of x.
+float sqrt(float x) {
+    return sqrtf(x);
+}
+
+// Arithmetic mean of x as a float; an empty vector yields 0 instead of dividing by zero.
+// The sum is taken in double: the former int / size_t division truncated and turned negative sums into huge values.
+float mean(const std::vector<int>& x) {
+    return x.empty() ? 0.0f : static_cast<float>(std::accumulate(x.begin(), x.end(), 0.0) / x.size());
+}
+}
\ No newline at end of file
diff --git a/cc/math/arith.h b/cc/math/arith.h
new file mode 100644
index 0000000..ffdfec4
--- /dev/null
+++ b/cc/math/arith.h
@@ -0,0 +1,20 @@
+#pragma once
+#include <cmath>
+#include <vector>
+#include <numeric>
+
+namespace arith {
+// Scalar helpers declared here; their definitions are in arith.cc.
+float sqrt(float x);
+float mean(const std::vector<int>& x);
+// Exercise stub (empty body, `c` is left untouched). nn::Linear::forward calls it as mm(features, weights, out, m, k, n) with features (m x k), weights (k x n) and out shaped {m, n} — presumably c = a * b; confirm the storage layout.
+template<typename T>
+void mm(const std::vector<T>& a, const std::vector<T>& b, std::vector<T>& c, size_t m, size_t k, size_t n) {
+    // TODO: fill this in, thanks.
+}
+// Exercise stub (empty body, `b` is left untouched). nn::ReLU::forward is told to call it — presumably b[i] = max(a[i], scalar); confirm.
+template<typename T>
+void vector_scalar_max(const std::vector<T>& a, std::vector<T> &b, T scalar) {
+    // TODO: fill this in, thanks.
+}
+}
\ No newline at end of file
diff --git a/cc/operators/autodiff.cc b/cc/operators/autodiff.cc
new file mode 100644
index 0000000..03aab51
--- /dev/null
+++ b/cc/operators/autodiff.cc
@@ -0,0 +1,32 @@
+#include "autodiff.h"
+
+namespace autodiff {
+
+// Collects the nodes of `scalars` whose `degree` (number of inputs) is zero, most recently queued first.
+// NOTE(review): ScalarFunction exposes no edges, so the release step below can only compare a node with
+// itself and nodes with degree > 0 are never emitted. Confirm the intended graph representation.
+std::vector<std::shared_ptr<ScalarFunction>> topoSort(const std::vector<std::shared_ptr<ScalarFunction>>& scalars) {
+    std::vector<std::shared_ptr<ScalarFunction>> sorted;
+    std::vector<std::shared_ptr<ScalarFunction>> frontier;
+    std::unordered_map<std::shared_ptr<ScalarFunction>, int> degree;
+    for (const auto& node : scalars) {
+        if (node->degree == 0) {
+            frontier.push_back(node);
+        } else {
+            degree.insert({node, node->degree});
+        }
+    }
+    while (!frontier.empty()) {
+        auto back = frontier.back();
+        frontier.pop_back();  // fix: without this pop the loop never terminated and `sorted` grew without bound
+        sorted.push_back(back);
+        for (auto& entry : degree) {
+            if (entry.second > 0 && entry.first == back && --entry.second == 0) {
+                frontier.push_back(entry.first);
+            }
+        }
+    }
+    return sorted;
+}
+
+}
\ No newline at end of file
diff --git a/cc/operators/autodiff.h b/cc/operators/autodiff.h
new file mode 100644
index 0000000..a485db5
--- /dev/null
+++ b/cc/operators/autodiff.h
@@ -0,0 +1,211 @@
+#pragma once
+#include <vector>
+#include <memory>
+#include <cmath>
+#include <unordered_map>
+
+namespace autodiff {
+
+template<typename T, typename F>  // exercise stub; test_central_difference calls it as (vector, callable taking the vector, index)
+auto central_difference(std::vector<T>& vec, F func, std::size_t arg, float epsilon = 1e-6) {
+    // TODO: implement this function and replace the return statement. Presumably a central finite difference of func w.r.t. vec[arg] with step epsilon — confirm.
+    return 0;
+}
+
+class ScalarFunction {  // Base node of the scalar autodiff graph: a value, a gradient slot and an input count.
+public:
+    float data = 0.0f;  // forward value; was indeterminate until a subclass constructor assigned it
+    float grad = 0.0f;  // gradient slot; nothing in this header writes it, so it was previously left uninitialised
+    int degree = 0;     // number of inputs: the subclasses in this header set 1 (unary) or 2 (binary)
+public:
+    ScalarFunction() {}
+}; // class ScalarFunction
+
+// Leaf node that holds a fixed value; `degree` keeps the base-class default of 0.
+class ConstantScalar: public ScalarFunction {
+public:
+    // Stores `data` as the node's value (the parameter shadows the member, hence the qualified name).
+    ConstantScalar(float data) { ScalarFunction::data = data; }
+}; // class ConstantScalar
+
+class Add: public ScalarFunction {  // Scalar node for a + b; forward() and backward() are still exercise stubs.
+public:
+    std::shared_ptr<ScalarFunction> a;
+    std::shared_ptr<ScalarFunction> b;
+public:
+    // Exercise: think about how this constructor is written (or ask an LLM to explain it).
+    Add(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b): a(a), b(b) {
+        this->data = a->data + b->data;
+        this->degree = 2;
+    }
+    float forward() {
+        // TODO: replace this placeholder return.
+        return 0;
+    }
+    std::vector<float> backward(float d_input) {
+        // TODO: replace this placeholder return (test_addscalar expects {d_input, d_input}).
+        return {0, 0};
+    }
+}; // class Add
+
+class Log: public ScalarFunction {  // Scalar node for log(a); forward() is still an exercise stub.
+public:
+    std::shared_ptr<ScalarFunction> a;
+public:
+    Log(std::shared_ptr<ScalarFunction> a): a(a) {
+        this->data = this->forward();
+        this->degree = 1;
+    }
+    float forward() {
+        // TODO: complete this return statement (test_logscalar expects logf(2) for an input of 2).
+        return 0.0f;
+    }
+    std::vector<float> backward(float d_input) {
+        // The derivative is provided for you:
+        // d/dx log(x) = 1/x, scaled by the incoming gradient d_input.
+        return {(1.0f * d_input / a->data)};
+    }
+}; // class Log
+
+class Mul: public ScalarFunction {  // Scalar node for a * b; forward() and backward() are still exercise stubs.
+public:
+    std::shared_ptr<ScalarFunction> a;
+    std::shared_ptr<ScalarFunction> b;
+public:
+    Mul(std::shared_ptr<ScalarFunction> a, std::shared_ptr<ScalarFunction> b) : a(a), b(b) {
+        this->data = this->forward();
+        this->degree = 2;
+    }
+    float forward() {
+        // TODO: replace this placeholder return.
+        return 0;
+    }
+    std::vector<float> backward(float d_input) {
+        // TODO: replace this placeholder return (test_mulscalar expects {6, 4} for a = 2, b = 3, d_input = 2).
+        return {0, 0};
+    }
+}; // class Mul
+
+class Inv: public ScalarFunction {  // Scalar node for 1 / a; backward() is still an exercise stub.
+public:
+    std::shared_ptr<ScalarFunction> a;
+public:
+    Inv(std::shared_ptr<ScalarFunction> a): a(a) {
+        this->data = this->forward();
+        this->degree = 1;
+    }
+    float forward() {
+        return 1.0f / a->data;
+    }
+    std::vector<float> backward(float d_input) {
+        // TODO: replace this placeholder return.
+        // The derivative of 1/x is -1/x^2.
+        return {0.0f};
+    }
+}; // class Inv
+
+class Sigmoid: public ScalarFunction {  // Scalar node for sigmoid(a); backward() is still an exercise stub.
+public:
+    std::shared_ptr<ScalarFunction> a;
+public:
+    Sigmoid(std::shared_ptr<ScalarFunction> a): a(a) {
+        this->data = this->forward();
+        this->degree = 1;
+    }
+    float forward() {
+        if (this->a->data >= 0.0) {  // both branches call expf only on a non-positive argument
+            return 1.0 / (1.0 + expf(-this->a->data));
+        }
+        else {
+            return expf(this->a->data) / (1.0 + expf(this->a->data));
+        }
+    }
+    std::vector<float> backward(float d_input) {
+        // Exercise: work out the derivative yourself this time.
+        // TODO: complete the code here.
+        return {0.0f};
+    }
+}; // class Sigmoid
+
+// for testing
+inline bool test_central_difference() {  // inline: this header is included by both autodiff.cc and uctc.cc
+    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
+    auto func = [](const std::vector<float>& x) -> float {
+        return x[0] + x[1] + x[2] + x[3] + x[4];
+    };
+    auto grad = central_difference(x, func, 2);  // the derivative of the sum w.r.t. x[2] is expected to be 1
+    if (std::fabs(grad - 1.0f) > 1e-4) {  // std::fabs: unqualified abs may bind to abs(int) and truncate the error to 0
+        return false;
+    }
+    return true;
+}
+
+inline bool test_addscalar() {  // inline: this header is included by both autodiff.cc and uctc.cc
+    auto a = std::make_shared<ConstantScalar>(1.0f);
+    auto b = std::make_shared<ConstantScalar>(2.0f);
+    auto c = std::make_shared<Add>(a, b);
+    if (c->data != 3.0f) {  // forward value: 1 + 2
+        return false;
+    }
+    auto res = c->backward(2.0f);
+    auto a_grad = res[0];
+    auto b_grad = res[1];
+    if (a_grad != 2.0f || b_grad != 2.0f) {  // both inputs should receive the upstream gradient unchanged
+        return false;
+    }
+    return true;
+}
+
+inline bool test_mulscalar() {  // inline: this header is included by both autodiff.cc and uctc.cc
+    auto a = std::make_shared<ConstantScalar>(2.0f);
+    auto b = std::make_shared<ConstantScalar>(3.0f);
+    auto c = std::make_shared<Mul>(a, b);
+    if (c->data != 6.0f) {  // forward value: 2 * 3
+        return false;
+    }
+    auto res = c->backward(2.0f);
+    auto a_grad = res[0];
+    auto b_grad = res[1];
+    if (a_grad != 6.0f || b_grad != 4.0f) {  // expected: upstream * b for a, upstream * a for b
+        return false;
+    }
+    return true;
+}
+
+inline bool test_logscalar() {  // inline: this header is included by both autodiff.cc and uctc.cc
+    auto a = std::make_shared<ConstantScalar>(2.0f);
+    auto b = std::make_shared<Log>(a);
+    if (std::fabs(b->data - logf(2.0f)) > 1e-4) {  // std::fabs: unqualified abs may bind to abs(int) and hide the error
+        return false;
+    }
+    auto res = b->backward(2.0f);
+    auto a_grad = res[0];
+    if (std::fabs(a_grad - 1.0f) > 1e-4) {  // upstream 2 times 1/x at x = 2 gives 1
+        return false;
+    }
+    return true;
+}
+
+inline bool test_invscalar() {  // inline: this header is included by both autodiff.cc and uctc.cc
+    auto a = std::make_shared<ConstantScalar>(2.0f);
+    auto b = std::make_shared<Inv>(a);
+    if (std::fabs(b->data - 0.5f) > 1e-4) {  // std::fabs: unqualified abs may bind to abs(int) and hide the error
+        return false;
+    }
+    auto res = b->backward(2.0f);
+    auto a_grad = res[0];
+    if (std::fabs(a_grad + 0.5f) > 1e-4) {  // upstream 2 times -1/x^2 at x = 2 gives -0.5
+        return false;
+    }
+    return true;
+}
+
+bool test_sigmoidscalar() {
+    auto a = std::make_shared<ConstantScalar>(2.0f);
+    auto b = std::make_shared<Sigmoid>(a);
+    // TODO: please write the test cases yourself, thanks.
+    // Do not simply `return true`: this code will be checked piece by piece.
+    return false;
+}
+
+}
\ No newline at end of file
diff --git a/cc/operators/nn.cc b/cc/operators/nn.cc
new file mode 100644
index 0000000..a341eb6
--- /dev/null
+++ b/cc/operators/nn.cc
@@ -0,0 +1,87 @@
+#include "nn.h"
+
+namespace nn {
+
+// Row-wise log-softmax of a tensor indexed as (shape[0] rows x shape[1] columns); returns a new tensor of that shape.
+std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits) {
+    const auto rows = logits->shape[0];
+    const auto cols = logits->shape[1];
+    auto out_shape = {rows, cols};
+    auto out = std::make_shared<tensor::Tensor>(out_shape);
+    for (std::size_t r = 0; r < rows; r++) {
+        const auto base = r * cols;
+        // Find the row maximum; it is subtracted below so every exponent is <= 0.
+        auto row_max = logits->data[base];
+        for (std::size_t c = 1; c < cols; c++) {
+            const auto candidate = logits->data[base + c];
+            row_max = row_max > candidate ? row_max : candidate;
+        }
+        auto exp_total = 0.0;
+        for (std::size_t c = 0; c < cols; c++) {
+            out->data[base + c] = logits->data[base + c] - row_max;
+            exp_total += exp(out->data[base + c]);
+        }
+        // log-softmax = shifted logit - log(sum of the shifted exponentials)
+        const auto log_total = log(exp_total);
+        for (std::size_t c = 0; c < cols; c++) {
+            out->data[base + c] -= log_total;
+        }
+    }
+    return out;
+}
+
+// Back-propagates from `loss` through the graph reachable via get_parents() and returns, for each entry
+// of `parameters` (in order), its accumulated gradient tensor. The gradient of `loss` itself is seeded by
+// writing 1.0 into element 0 of a zero tensor shaped like loss->data.
+std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters) {
+    loss->used = true;
+
+    std::unordered_set<std::shared_ptr<Node>> seen;
+    std::vector<std::shared_ptr<Node>> order;
+
+    // Depth-first post-order walk: a node is appended to `order` only after all of its parents.
+    std::function<void(std::shared_ptr<Node>)> walk = [&](std::shared_ptr<Node> current) {
+        if (seen.count(current) != 0) {
+            return;
+        }
+        for (const auto& parent : current->get_parents()) {
+            walk(parent);
+        }
+        seen.insert(current);
+        order.push_back(current);
+    };
+    walk(loss);
+
+    // Parameters the walk did not reach still get a (zero) gradient entry.
+    seen.insert(parameters.begin(), parameters.end());
+
+    // One zero-initialised accumulator per known node, shaped like the node's value.
+    std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<tensor::Tensor>> grads;
+    for (const auto& node : seen) {
+        grads[node] = std::make_shared<tensor::Tensor>(node->data->shape);
+    }
+    grads[loss] = std::make_shared<tensor::Tensor>(loss->data->shape);
+    grads[loss]->data[0] = 1.0;
+
+    // Walk `order` backwards so each node is processed before the parents it feeds gradients into.
+    for (auto it = order.rbegin(); it != order.rend(); ++it) {
+        const auto& node = *it;
+        const auto upstream = node->backward(grads[node]);
+        const auto inputs = node->get_parents();
+        for (std::size_t p = 0; p < inputs.size(); ++p) {
+            auto& target = grads[inputs[p]]->data;
+            const auto& delta = upstream[p]->data;
+            for (std::size_t e = 0; e < inputs[p]->data->size; ++e) {
+                target[e] += delta[e];
+            }
+        }
+    }
+
+    std::vector<std::shared_ptr<tensor::Tensor>> result;
+    for (const auto& param : parameters) {
+        result.emplace_back(grads[param]);
+    }
+    return result;
+}
+
+}
\ No newline at end of file
diff --git a/cc/operators/nn.h b/cc/operators/nn.h
new file mode 100644
index 0000000..e4824e8
--- /dev/null
+++ b/cc/operators/nn.h
@@ -0,0 +1,274 @@
+#pragma once
+#include <vector>
+#include <memory>
+#include <unordered_set>
+#include <unordered_map>
+#include <algorithm>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <iostream>
+#include "../tensor/tensor.h"
+#include "../math/arith.h"
+
+namespace py = pybind11;
+
+namespace nn {
+
+class Node {  // Abstract graph vertex: owns a value tensor and the list of nodes it was computed from.
+public:
+    std::shared_ptr<tensor::Tensor> data;                   // value held by this node
+    std::vector<std::shared_ptr<Node>> objects;             // input nodes, returned by get_parents()
+    std::vector<std::shared_ptr<tensor::Tensor>> gradient;  // not read or written anywhere in this header
+
+    Node() = default;
+    virtual ~Node() = default;
+
+    // Computes this node's value from its inputs.
+    virtual std::shared_ptr<tensor::Tensor> forward() = 0;
+    // Given the gradient w.r.t. this node's output, returns one gradient tensor per entry of `objects`
+    // (gradients() in nn.cc indexes the result in parallel with get_parents()).
+    virtual std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) = 0;
+
+    // Copy of the input-node list.
+    std::vector<std::shared_ptr<Node>> get_parents() { return objects; }
+    // Copy of the flat element data of this node's tensor.
+    std::vector<float> get_data() { return data->data; }
+    // Shared handle to this node's tensor.
+    std::shared_ptr<tensor::Tensor> get_tensor() { return data; }
+};
+
+class DataNode: public Node {  // common base of Parameter and Constant
+public:
+    DataNode() = default;
+}; // class DataNode
+
+class Parameter: public DataNode {  // Trainable leaf node whose value is copied from a NumPy array.
+public:
+    // Copies `array` into a new tensor through tensor::pyarray_to_tensor, the helper Constant already
+    // uses, rather than repeating the same buffer-copy code here.
+    Parameter(py::array_t<float> array) {
+        this->data = tensor::pyarray_to_tensor(array);
+    }
+
+    // A parameter's forward value is simply its stored tensor.
+    std::shared_ptr<tensor::Tensor> forward() override {
+        return this->data;
+    }
+    // Leaf node: returns the incoming gradient unchanged as a one-element list.
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        return {gradient};
+    }
+
+    // In-place gradient-descent step: data[i] -= lr * grad[i] for every element of the parameter.
+    // Throws std::invalid_argument when `grad` is null (the Python binding declares nullptr as its
+    // default) or holds fewer elements than the parameter; both cases used to be undefined behaviour.
+    void update(std::shared_ptr<tensor::Tensor> grad, double lr) {
+        if (!grad || grad->data.size() < this->data->size) {
+            throw std::invalid_argument("Parameter::update: grad is missing or smaller than the parameter");
+        }
+        for (std::size_t i = 0; i < this->data->size; i++) {
+            this->data->data[i] -= lr * grad->data[i];
+        }
+    }
+}; // class Parameter
+
+class Constant: public DataNode {  // Non-trainable leaf node.
+public:
+    // Shares ownership of an existing tensor (the parameter shadows the member, hence the qualified name).
+    Constant(std::shared_ptr<tensor::Tensor> data) { Node::data = data; }
+    // Builds the tensor from a NumPy array via tensor::pyarray_to_tensor.
+    Constant(py::array_t<float> array) { Node::data = tensor::pyarray_to_tensor(array); }
+
+    // The forward value is the stored tensor itself.
+    std::shared_ptr<tensor::Tensor> forward() override {
+        return data;
+    }
+    // Leaf node: the incoming gradient is handed back unchanged as a one-element list.
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        return {gradient};
+    }
+}; // class Constant
+
+// Base for computed nodes: records its one or two input nodes in `objects`, in argument order.
+class FunctionNode: public Node {
+public:
+    FunctionNode(std::shared_ptr<Node> a, std::shared_ptr<Node> b) {
+        this->objects = {a, b};
+    }
+    FunctionNode(std::shared_ptr<Node> a) {
+        this->objects = {a};
+    }
+    // Default forward: yields no tensor (null); concrete nodes override it.
+    std::shared_ptr<tensor::Tensor> forward() override {
+        return {};
+    }
+}; //class FunctionNode
+
+class Add: public FunctionNode {  // Element-wise sum of two nodes holding the same number of elements.
+public:
+    Add(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
+        this->data = this->forward();
+    }
+    // Returns a new tensor shaped like the first input with out[i] = a[i] + b[i].
+    // Throws std::invalid_argument if the element counts differ; a shorter second input used to be read out of bounds.
+    std::shared_ptr<tensor::Tensor> forward() override {
+        const auto& lhs = this->objects[0]->data;
+        const auto& rhs = this->objects[1]->data;
+        if (lhs->size != rhs->size) throw std::invalid_argument("Add: operands must have the same number of elements");
+        auto outNode = std::make_shared<tensor::Tensor>(lhs->shape);
+        for (std::size_t i = 0; i < lhs->size; i++) outNode->data[i] = lhs->data[i] + rhs->data[i];
+        return outNode;
+    }
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        return {gradient, gradient};  // d(a+b)/da = d(a+b)/db = 1: both inputs get the same gradient tensor
+    }
+};
+
+class AddBias: public FunctionNode {  // Adds a bias row to every row of a feature matrix; forward/backward are exercise stubs.
+public:
+    AddBias(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
+        this->data = this->forward();
+    }
+    std::shared_ptr<tensor::Tensor> forward() override {
+        // features: a Node with shape (batch_size x num_features)
+        // bias: a Node with shape (1 x num_features)
+        auto features = this->objects[0];
+        auto bias = this->objects[1];
+        auto outNode = std::make_shared<tensor::Tensor>(features->data->shape);
+        // Exercise: the addition is a plain for loop.
+        // TODO: complete the code here (until then outNode is returned zero-filled).
+        return outNode;
+    }
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        // assertion needed
+        auto g_bias = std::make_shared<tensor::Tensor>(this->objects[1]->data->shape);
+        // TODO: complete the code here.
+        // (until then g_bias is returned zero-filled)
+        return {gradient, g_bias};
+    }
+    std::vector<float> get_data() {
+        return this->data->data;
+    }
+}; // class AddBias
+
+
+class Linear: public FunctionNode {  // Matrix product features * weights; constructor and backward are exercise stubs.
+public:
+    Linear(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {
+        // Exercise: this is a single line — see how the other classes' constructors do it.
+        // TODO: complete it here (until then `data` stays null).
+    }
+    std::shared_ptr<tensor::Tensor> forward() override {
+        // features: (batch_size x input_features)
+        auto features = this->objects[0];
+        // weights: (input_features x output_features)
+        auto weights = this->objects[1];
+        auto m = features->data->shape[0];
+        auto k = features->data->shape[1];
+        auto n = weights->data->shape[1];
+        // std::cout << m << " " << n << " " << k << std::endl;
+        // output: (batch_size x output_features)
+        auto shape = {m, n};
+        auto outNode = std::make_shared<tensor::Tensor>(shape);
+        // What actually needs completing is arith::mm (math/arith.h).
+        // Nothing else in this function has to change.
+        arith::mm(features->data->data, weights->data->data, outNode->data, m, k, n);
+        return outNode;
+    }
+
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        auto features = this->objects[0];
+        auto weights = this->objects[1];
+        // gradient.shape[0] == features.shape[0]
+        // gradient.shape[1] == weights.shape[1]
+        auto grad_features_shape = {gradient->shape[0], weights->data->shape[0]};
+        auto grad_features = std::make_shared<tensor::Tensor>(grad_features_shape);
+        auto grad_weights_shape = {features->data->shape[1], gradient->shape[1]};
+        auto grad_weights = std::make_shared<tensor::Tensor>(grad_weights_shape);
+        // TODO: call arith::mm twice here — work out which two matrices each call multiplies.
+        return {grad_features, grad_weights};
+    }
+}; //class Linear
+
+class ReLU: public FunctionNode {  // Rectifier node; constructor, forward and backward are exercise stubs.
+public:
+    ReLU(std::shared_ptr<Node> a) : FunctionNode(a) {
+        // TODO: complete this (until then `data` stays null).
+    }
+    std::shared_ptr<tensor::Tensor> forward() override {
+        // x: a Node with shape (batch_size x num_features)
+        auto outNode = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
+        // TODO: complete this by calling arith::vector_scalar_max.
+        return outNode;
+    }
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        auto grads = std::make_shared<tensor::Tensor>(this->objects[0]->data->shape);
+        // TODO: complete this with a single for loop.
+        // (until then `grads` is returned zero-filled)
+        return {grads};
+    }
+}; // class ReLU
+
+class Loss: public FunctionNode {  // base for two-input loss nodes; still abstract (backward is not overridden here)
+public:
+    bool used{false};  // set to true by gradients() when the loss is differentiated
+
+    Loss(std::shared_ptr<Node> a, std::shared_ptr<Node> b) : FunctionNode(a, b) {}
+};
+
+class SquareLoss: public Loss {  // Squared-error loss between two nodes; all three members are exercise stubs.
+public:
+    SquareLoss(std::shared_ptr<Node> a, std::shared_ptr<Node> b): Loss(a, b) {
+        // TODO: complete the code here (until then `data` stays null).
+    }
+    std::shared_ptr<tensor::Tensor> forward() {
+        // a: a Node with shape (batch_size x dim)
+        // b: a Node with shape (batch_size x dim)
+        // This one is simple; just make sure the returned `res` is a tensor.
+        // TODO: modify the code below (it currently returns a zero-filled tensor of shape {1}).
+        std::vector<size_t> res_shape = {1};
+        auto res = std::make_shared<tensor::Tensor>(res_shape);
+        return res;
+    }
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        float g = gradient->data[0];
+        auto a = this->objects[0];
+        auto b = this->objects[1];
+        auto grad_a = std::make_shared<tensor::Tensor>(a->data->shape);
+        auto grad_b = std::make_shared<tensor::Tensor>(b->data->shape);
+        // TODO: complete the code below (both gradients are currently returned zero-filled).
+        return {grad_a, grad_b};
+    }
+}; // class SquareLoss
+
+std::shared_ptr<tensor::Tensor> log_softmax(std::shared_ptr<tensor::Tensor> logits);
+
+class SoftmaxLoss: public Loss {  // Softmax loss over (logits, labels); forward and backward are exercise stubs.
+public:
+    SoftmaxLoss(std::shared_ptr<Node> logits, std::shared_ptr<Node> labels): Loss(logits, labels) {
+        this->data = this->forward();
+    }
+
+    std::shared_ptr<tensor::Tensor> forward() {
+        // log_softmax is already written for you (see nn.cc).
+        auto log_probs = log_softmax(this->objects[0]->data);
+        // TODO: complete the code below to compute the softmax loss (currently a zero-filled tensor of shape {1}).
+        std::vector<size_t> res_shape = {1};
+        auto res = std::make_shared<tensor::Tensor>(res_shape);
+        return res;
+    }
+    std::vector<std::shared_ptr<tensor::Tensor>> backward(std::shared_ptr<tensor::Tensor> gradient) override {
+        auto log_probs = log_softmax(this->objects[0]->data);
+        auto labels = this->objects[1]->data;
+        auto batch_size = log_probs->shape[0];
+        auto num_classes = log_probs->shape[1];
+        auto grad_logits = std::make_shared<tensor::Tensor>(log_probs->shape);
+        auto grad_labels = std::make_shared<tensor::Tensor>(labels->shape);
+        // TODO: complete the code below (both gradients are currently returned zero-filled).
+        return {grad_logits, grad_labels};
+    }
+}; // class SoftmaxLoss
+
+std::vector<std::shared_ptr<tensor::Tensor>> gradients(std::shared_ptr<Loss> loss, std::vector<std::shared_ptr<Node>> parameters);
+
+}
\ No newline at end of file
diff --git a/cc/operators/ops.cc b/cc/operators/ops.cc
new file mode 100644
index 0000000..36890b9
--- /dev/null
+++ b/cc/operators/ops.cc
@@ -0,0 +1,54 @@
+#include "ops.h"
+
+namespace operators {
+static float epsilon = 1e-6;  // not referenced by any function in this file yet
+
+float is_close(float x, float y) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+float sigmoid(float x) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+float relu(float x) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+float inv(float x) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+float inv_back(float x, float d) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+float relu_back(float x, float d) {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0;
+}
+
+auto sumList(const std::vector<float>& vec) -> float {
+    return reduce(vec, 0.0f, add<float>);  // folds with add<float>, itself an exercise stub in ops.h that returns 0
+}
+
+auto prodList(const std::vector<float>& vec) -> float {
+    // TODO (exercise): replace this placeholder return.
+    return 0.0f;
+}
+
+auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float> {
+    // TODO (exercise): replace this placeholder return.
+    return std::vector<float>(1, 0.0f);
+}
+
+auto negList(const std::vector<float>& vec) -> std::vector<float> {
+    // TODO (exercise): replace this placeholder return.
+    return std::vector<float>(1, 0.0f);
+}
+}
diff --git a/cc/operators/ops.h b/cc/operators/ops.h
new file mode 100644
index 0000000..7933ca1
--- /dev/null
+++ b/cc/operators/ops.h
@@ -0,0 +1,88 @@
+#pragma once
+#include <cmath>
+#include <functional>
+#include <vector>
+#include <algorithm>
+#include <stdexcept>
+#include <numeric>
+
+namespace operators {
+// Scalar building blocks. Every function down to `max` is an exercise stub that returns a placeholder.
+template<typename T>
+T mul(T a, T b) {
+    return 0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+T id(T a) {
+    return 0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+T add(T a, T b) {
+    return 0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+T neg(T a) {
+    return 0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+float lt(T a, T b) {
+    return 0.0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+float eq(T a, T b) {
+    return 0.0; // TODO (exercise): replace this placeholder return
+}
+
+template<typename T>
+T max(T a, T b) {
+    return 0; // TODO (exercise): replace this placeholder return
+}
+// Applies func to every element of vec and returns the results in the same order.
+template<typename T, typename F>
+auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> {
+
+    std::vector<decltype(func(std::declval<T>()))> result;
+    result.reserve(vec.size());
+
+    std::transform(vec.begin(), vec.end(), std::back_inserter(result), func);
+
+    return result;
+}
+// Exercise stub: only the size check is implemented; throws std::invalid_argument when the sizes differ.
+template <typename T1, typename T2, typename F>
+auto zipWith(const std::vector<T1>& vec1, const std::vector<T2>& vec2, F func)
+    -> std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))> {
+
+    if (vec1.size() != vec2.size()) {
+        // The size-mismatch exception is already thrown for you here.
+        throw std::invalid_argument("Vectors must have the same size");
+    }
+    // TODO: complete the rest here.
+    // Hint: push_back appends an element to a vector.
+    // To make it a little easier: declare a `result` variable here, modelled on the map function.
+
+    return std::vector<decltype(func(std::declval<T1>(), std::declval<T2>()))>(1); // TODO: remember to return `result` instead
+}
+// Left fold of vec with func via std::accumulate, starting from init.
+template<typename T, typename F>
+auto reduce(const std::vector<T>& vec, T init, F func) -> T {
+    return std::accumulate(vec.begin(), vec.end(), init, func);
+}
+// The functions declared below are defined in ops.cc.
+float is_close(float x, float y);
+float sigmoid(float x);
+float relu(float x);
+float inv(float x);
+float inv_back(float x, float d);
+float relu_back(float x, float d);
+
+auto sumList(const std::vector<float>& vec) -> float;
+auto prodList(const std::vector<float>& vec) -> float;
+auto addLists(const std::vector<float>& vec1, const std::vector<float>& vec2) -> std::vector<float>;
+auto negList(const std::vector<float>& vec) -> std::vector<float>;
+}
diff --git a/cc/tensor/pyarray.cc b/cc/tensor/pyarray.cc
new file mode 100644
index 0000000..a41dd61
--- /dev/null
+++ b/cc/tensor/pyarray.cc
@@ -0,0 +1,12 @@
+#include "pyarray.h"
+
+namespace pyarr {
+// Returns the elements of `array` as a flat vector in C (row-major) order. The data is taken from a C-contiguous array: the raw buffer of a sliced or transposed view is not in that order.
+std::vector<float> ndarray_to_vector(py::array_t<float> array) {
+    auto dense = py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(array);
+    if (!dense) throw py::value_error("ndarray_to_vector: argument cannot be converted to a contiguous float array");
+    const py::buffer_info info = dense.request();
+    const float* first = static_cast<const float*>(info.ptr);
+    return std::vector<float>(first, first + info.size);
+}
+}
\ No newline at end of file
diff --git a/cc/tensor/pyarray.h b/cc/tensor/pyarray.h
new file mode 100644
index 0000000..d8d53cf
--- /dev/null
+++ b/cc/tensor/pyarray.h
@@ -0,0 +1,10 @@
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace pyarr {
+// Copies the elements of a float NumPy array into a std::vector<float> (defined in pyarray.cc).
+std::vector<float> ndarray_to_vector(py::array_t<float> array);
+
+}
\ No newline at end of file
diff --git a/cc/tensor/tensor.cc b/cc/tensor/tensor.cc
new file mode 100644
index 0000000..bedf5ff
--- /dev/null
+++ b/cc/tensor/tensor.cc
@@ -0,0 +1,76 @@
+#include "tensor.h"
+
+namespace tensor {
+
+std::shared_ptr<Tensor> Tensor::transpose() {  // exercise stub: currently returns a zero-filled (cols x rows) tensor
+    // The check below is disabled for now; every tensor is assumed to be 2-D.
+    // if (shape.size() != 2) {
+    //     throw std::runtime_error("Transpose is only supported for 2D tensors.");
+    // }
+
+    // The row and column counts are available here, but the data lives in a flat 1-D vector: how can "transpose" be implemented on it?
+    std::size_t rows = shape[0];
+    std::size_t cols = shape[1];
+    std::vector<size_t> new_shape = {cols, rows};
+    // `size` is a member of Tensor (tensor.h); with the VSCode C/C++ Extension Pack, ctrl+click on it jumps to its definition.
+    std::vector<float> transposed_data(size);
+
+    // TODO: write the transpose code here.
+
+    // TODO: read the Tensor definition and create the new Tensor here.
+    // Note: it must be returned through a shared_ptr.
+    // (transposed_data is not used by the placeholder return below)
+    return std::make_shared<Tensor>(new_shape);
+}
+
+
+std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array) {
+    // Builds a Tensor with the array's shape and a copy of its elements in C (row-major) order.
+    // The copy goes through a C-contiguous array: the raw buffer of a sliced or transposed NumPy view is
+    // not laid out in that order, so copying `info.size` floats straight from its pointer scrambled the data.
+    auto dense = py::array_t<float, py::array::c_style | py::array::forcecast>::ensure(array);
+    if (!dense) throw py::value_error("pyarray_to_tensor: argument cannot be converted to a contiguous float array");
+    const py::buffer_info info = dense.request();
+    auto tensor = std::make_shared<Tensor>(std::vector<std::size_t>(info.shape.begin(), info.shape.end()));
+    const float* first = static_cast<const float*>(info.ptr);
+    tensor->data.assign(first, first + info.size);
+    return tensor;
+}
+
+std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis) {  // exercise stub: returns a zero-filled tensor
+    // you only need to handle the two dimensional tensor, and the axis can be either 0 or 1
+    // the tensor's shape is (batch_size, features)
+    // if the axis is 0, it outputs a tensor (1, features)
+    // if the axis is 1, it outputs a tensor (batch_size, 1)
+
+    // compute the output's shape
+    std::vector<std::size_t> output_shape = tensor->shape;
+    output_shape.erase(output_shape.begin() + axis);  // NOTE(review): this drops the axis entirely (1-D result), unlike the (1, n)/(n, 1) shapes described above — confirm
+
+    auto result = std::make_shared<Tensor>(output_shape);
+    // Hint: handle axis == 0 and axis == 1 separately. The task is reduced to finding the maximum of every row or every column of a 2-D tensor and writing a 1-D tensor.
+    // TODO: complete the code here.
+    return result;
+}
+
+// Sums every element of `tensor`, divides by its `size` member and returns the result in a tensor of shape {1}.
+std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor) {
+    float total = 0.0f;
+    for (const float value : tensor->data) {
+        total += value;
+    }
+    const std::vector<std::size_t> scalar_shape = {1};
+    auto out = std::make_shared<Tensor>(scalar_shape);
+    out->data[0] = total / tensor->size;
+    return out;
+}
+
+// Element-wise expf over the first `size` entries of `tensor`, written into a new tensor of the same shape.
+std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor) {
+    auto out = std::make_shared<Tensor>(tensor->shape);
+    const std::size_t count = tensor->size;
+    for (std::size_t idx = 0; idx != count; ++idx) out->data[idx] = expf(tensor->data[idx]);
+    return out;
+}
+
+}
\ No newline at end of file
diff --git a/cc/tensor/tensor.h b/cc/tensor/tensor.h
new file mode 100644
index 0000000..adaef3c
--- /dev/null
+++ b/cc/tensor/tensor.h
@@ -0,0 +1,92 @@
+#pragma once
+#include <numeric>
+#include <random>
+#include <vector>
+#include <memory>
+#include <stdexcept>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+namespace py = pybind11;
+
+namespace tensor {
+
+class Tensor {  // Dense float tensor stored flat; get()/set() treat the last dimension as the fastest-varying one.
+public:
+    std::vector<float> data;         // flat element storage
+    std::vector<std::size_t> shape;  // extent of every dimension
+    std::size_t size;                // product of the extents (1 for an empty shape)
+public:
+    // Builds a zero-filled tensor of `shape`. With rand_init the elements are drawn uniformly from
+    // [-limit, limit], limit = sqrt(3 / ((shape[0] + shape[1]) / 2)), using the fixed seed 42.
+    // Throws std::invalid_argument if rand_init is requested with fewer than two dimensions.
+    Tensor(const std::vector<std::size_t>& shape, bool rand_init = false) {
+        // Multiply in std::size_t: the former int accumulator overflowed for large element counts.
+        this->size = std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<std::size_t>());
+        this->data.resize(this->size);
+        this->shape = shape;
+        if (rand_init) {
+            if (shape.size() < 2) {  // the limit formula below reads shape[0] and shape[1]
+                throw std::invalid_argument("rand_init requires at least two dimensions");
+            }
+            double limit = std::sqrt(3.0 / ((shape[0] + shape[1]) / 2.0));
+            std::mt19937 gen(42);
+            std::uniform_real_distribution<float> dis(-limit, limit);
+            for (std::size_t i = 0; i < this->size; ++i) {
+                this->data[i] = dis(gen);
+            }
+        }
+    }
+    std::shared_ptr<Tensor> transpose();
+
+    // Element-wise sum; throws std::runtime_error when the shapes differ.
+    Tensor operator+(const Tensor& other) const {
+        if (this->shape != other.shape) {
+            throw std::runtime_error("Shapes do not match");
+        }
+        Tensor result(this->shape);
+        for (std::size_t i = 0; i < this->size; ++i) {
+            result.data[i] = this->data[i] + other.data[i];
+        }
+        return result;
+    }
+
+    // NOTE(review): this is an element-wise equality test (1.0f where equal, else 0.0f) that never
+    // modifies *this; it was probably meant to be operator==. Signature kept for compatibility.
+    Tensor operator=(const Tensor& other) const {
+        if (this->shape != other.shape) {
+            throw std::runtime_error("Shapes do not match");
+        }
+        Tensor result(this->shape);
+        for (std::size_t i = 0; i < this->size; i++) {
+            result.data[i] = (this->data[i] == other.data[i]);
+        }
+        return result;
+    }
+
+    std::vector<std::size_t> get_shape() const { return this->shape; }  // copy of the shape
+    std::vector<float> get_data() const { return this->data; }          // copy of the elements
+    // Element access with one index per dimension; both throw std::out_of_range on a bad index list.
+    float get(const std::vector<std::size_t>& indices) const { return data[flat_index(indices)]; }
+    void set(const std::vector<std::size_t>& indices, float value) { data[flat_index(indices)] = value; }
+    ~Tensor() = default;
+private:
+    // Maps per-dimension indices to the flat offset after validating their count and range.
+    std::size_t flat_index(const std::vector<std::size_t>& indices) const {
+        if (indices.size() != shape.size()) throw std::out_of_range("Tensor: wrong number of indices");
+        std::size_t index = 0;
+        std::size_t stride = 1;
+        for (std::size_t d = shape.size(); d-- > 0;) {
+            if (indices[d] >= shape[d]) throw std::out_of_range("Tensor: index out of range");
+            index += indices[d] * stride;
+            stride *= shape[d];
+        }
+        return index;
+    }
+};  // class Tensor
+
+std::shared_ptr<Tensor> pyarray_to_tensor(py::array_t<float> array);
+std::shared_ptr<Tensor> argmax(const std::shared_ptr<Tensor>& tensor, int axis);
+std::shared_ptr<Tensor> mean(const std::shared_ptr<Tensor>& tensor);
+std::shared_ptr<Tensor> exp(const std::shared_ptr<Tensor>& tensor);
+}  // namespace tensor
\ No newline at end of file
diff --git a/cc/uctc.cc b/cc/uctc.cc
new file mode 100644
index 0000000..3e540dc
--- /dev/null
+++ b/cc/uctc.cc
@@ -0,0 +1,117 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include "math/arith.h"
+#include "operators/nn.h"
+#include "tensor/tensor.h"
+#include "operators/ops.h"
+#include "operators/autodiff.h"
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(uctc, m) {  // Defines the Python extension module `uctc`; `m` is its top-level module object.
+
+    py::module C = m.def_submodule("C", "C module");
+
+    py::module arith = C.def_submodule("arith", "Arithmetic module");  // reachable from Python as uctc.C.arith
+    arith.def("sqrt", &arith::sqrt, "Square root function", py::arg("x") = 0.0);  // `arith::` names the C++ namespace; the local `arith` is the submodule object
+
+    py::class_<tensor::Tensor, std::shared_ptr<tensor::Tensor>>(m, "Tensor")  // std::shared_ptr is the holder type for Tensor
+    .def_readonly("shape", &tensor::Tensor::shape)
+    .def_readonly("size", &tensor::Tensor::size)
+    .def("data", &tensor::Tensor::get_data, "Get the data of the tensor", pybind11::return_value_policy::copy)
+    .def("transpose", &tensor::Tensor::transpose, "Transpose the tensor", pybind11::return_value_policy::copy);
+    
+    py::module nn = m.def_submodule("nn", "Neural network module");  // reachable from Python as uctc.nn
+    py::class_<nn::Node, std::shared_ptr<nn::Node>>(nn, "Node")  // registered first: the classes below name it (directly or indirectly) as their base
+    .def("data", &nn::Node::get_data, "Get the data of the node", pybind11::return_value_policy::copy)
+    .def("tensor", &nn::Node::get_tensor, "Get the tensor of the node", pybind11::return_value_policy::automatic_reference);
+
+    py::class_<nn::DataNode, nn::Node, std::shared_ptr<nn::DataNode>>(nn, "DataNode");  // no constructor bound: only its subclasses are created from Python
+
+    py::class_<nn::Parameter, nn::DataNode, std::shared_ptr<nn::Parameter>>(nn, "Parameter")
+    .def(pybind11::init<py::array_t<float>>(), "Create a parameter from an array.")
+    .def("update", &nn::Parameter::update, "Update the parameter node", py::arg("grad") = nullptr, py::arg("learning_rate") = 0.001);  // both arguments have defaults
+
+    py::class_<nn::Constant, nn::DataNode, std::shared_ptr<nn::Constant>>(nn, "Constant")
+    .def(pybind11::init<py::array_t<float>>(), "Create a constant node from a numpy array");
+
+    py::class_<nn::FunctionNode, nn::Node, std::shared_ptr<nn::FunctionNode>>(nn, "FunctionNode");  // no constructor bound: base for the operation nodes below
+
+    py::class_<nn::Add, nn::FunctionNode, std::shared_ptr<nn::Add>>(nn, "Add")
+    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add function node")
+    .def("forward", &nn::Add::forward, "Forward function");
+
+    py::class_<nn::AddBias, nn::FunctionNode, std::shared_ptr<nn::AddBias>>(nn, "AddBias")
+    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create an add bias function node")
+    .def("forward", &nn::AddBias::forward, "Forward function")
+    .def("data", &nn::AddBias::get_data, "Get the data of the node", pybind11::return_value_policy::automatic_reference);
+
+    py::class_<nn::Linear, nn::FunctionNode, std::shared_ptr<nn::Linear>>(nn, "Linear")
+    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a linear function node")
+    .def("forward", &nn::Linear::forward, "Forward function");
+
+    py::class_<nn::ReLU, nn::FunctionNode, std::shared_ptr<nn::ReLU>>(nn, "ReLU")
+    .def(py::init<std::shared_ptr<nn::Node>>(), "Create a ReLU function node");
+
+    py::class_<nn::Loss, nn::FunctionNode, std::shared_ptr<nn::Loss>>(nn, "Loss");  // no constructor bound: base for the two loss nodes below
+
+    py::class_<nn::SquareLoss, nn::Loss, std::shared_ptr<nn::SquareLoss>>(nn, "SquareLoss")
+    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a square loss function node");
+    py::class_<nn::SoftmaxLoss, nn::Loss, std::shared_ptr<nn::SoftmaxLoss>>(nn, "SoftmaxLoss")
+    .def(py::init<std::shared_ptr<nn::Node>, std::shared_ptr<nn::Node>>(), "Create a softmax loss function node");
+    
+    nn.def("log_softmax", &nn::log_softmax, "Log softmax function", py::arg("logits"));
+
+    nn.def("gradients", &nn::gradients, "Calculate the gradients", py::arg("loss") = nullptr, py::arg("nodes") = std::vector<std::shared_ptr<nn::Node>>{});
+    nn.def("pyarray_to_tensor", &tensor::pyarray_to_tensor, "Convert a numpy array to a tensor", py::arg("arr"));  // tensor:: helpers are exposed under uctc.nn
+    nn.def("argmax", &tensor::argmax, "Get a tensor's argmax", py::arg("tensor"), py::arg("axis"));
+    nn.def("mean", &tensor::mean, "Get a tensor element's mean value", py::arg("tensor"));
+    nn.def("exp", &tensor::exp, "Get exp of a tensor", py::arg("tensor"));
+
+    // framework test
+    py::module framework = m.def_submodule("framework", "Framework module");
+    py::module basis = framework.def_submodule("basis", "Basic modules");  // reachable from Python as uctc.framework.basis
+    
+    // task 1
+    basis.def("mul", &operators::mul<int>, "Multiply two integers", py::arg("a"), py::arg("b"));  // the task-1 templates are bound for int only
+    basis.def("id", &operators::id<int>, "Identity function", py::arg("a"));
+    basis.def("add", &operators::add<int>, "Add two integers", py::arg("a"), py::arg("b"));
+    basis.def("neg", &operators::neg<int>, "Negate an integer", py::arg("a"));
+    basis.def("lt", &operators::lt<int>, "Less than operator", py::arg("a"), py::arg("b"));
+    basis.def("eq", &operators::eq<int>, "Equal operator", py::arg("a"), py::arg("b"));
+    basis.def("max", &operators::max<int>, "Max operator", py::arg("a"), py::arg("b"));
+    
+    // task 2
+    basis.def("is_close", &operators::is_close, "Check if two floats are close", py::arg("x"), py::arg("y"));
+    basis.def("sigmoid", &operators::sigmoid, "Sigmoid function", py::arg("x"));
+    basis.def("relu", &operators::relu, "ReLU function", py::arg("x"));
+    basis.def("inv", &operators::inv, "Inverse function", py::arg("x"));
+    basis.def("inv_back", &operators::inv_back, "Inv back function", py::arg("x"), py::arg("d"));
+    basis.def("relu_back", &operators::relu_back, "ReLU back function", py::arg("x"), py::arg("d"));
+
+    // task 3
+    basis.def("negList", &operators::negList, "Negate a list of integers", py::arg("lst"));
+
+    // task 4, 5
+    basis.def("addLists", &operators::addLists, "Add two lists of integers", py::arg("lst1"), py::arg("lst2"));
+
+    // task 6
+    basis.def("sumList", &operators::sumList, "Sum a list of integers", py::arg("lst"));
+
+    // task 7
+    basis.def("prodList", &operators::prodList, "Multiply a list of integers", py::arg("lst"));
+
+    py::module autodiff = framework.def_submodule("autodiff", "Autodiff modules");  // reachable from Python as uctc.framework.autodiff
+    autodiff.def("test_central_difference", &autodiff::test_central_difference, "Test central difference");  // the autodiff self-tests are bound without named arguments
+    
+    autodiff.def("test_addscalar", &autodiff::test_addscalar, "Test add scalar");
+
+    autodiff.def("test_mulscalar", &autodiff::test_mulscalar, "Test mul scalar");
+
+    autodiff.def("test_logscalar", &autodiff::test_logscalar, "Test log scalar");
+
+    autodiff.def("test_invscalar", &autodiff::test_invscalar, "Test inv scalar");
+
+    autodiff.def("test_sigmoidscalar", &autodiff::test_sigmoidscalar, "Test sigmoid scalar");
+}
+
diff --git a/frontend/framework/autodiff/test_task7.py b/frontend/framework/autodiff/test_task7.py
new file mode 100644
index 0000000..01f1447
--- /dev/null
+++ b/frontend/framework/autodiff/test_task7.py
@@ -0,0 +1,16 @@
+from uctc.framework import autodiff
+import numpy as np
+from functools import reduce
+import random
+
+lst = [autodiff.test_central_difference, autodiff.test_addscalar, autodiff.test_mulscalar, autodiff.test_logscalar, autodiff.test_invscalar, autodiff.test_sigmoidscalar]
+for e in lst:  # each entry is a C++ self-test called with no arguments; a truthy result means it passed
+    if e():
+        print(f"\033[1;34mPassed: {e.__name__} passed all tests\033[0m")
+    else:
+        print(f"\033[1;31mError: {e.__name__} failed test... expects true but gets false\033[0m")
+        raise SystemExit(1)  # a failed check must produce a non-zero exit status
+
+
+# Banner names the task from this file's name (test_task7.py).
+print(f"\033[1;32m[PASSED] Task 7 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/basis/config.py b/frontend/framework/basis/config.py
new file mode 100644
index 0000000..4c1313e
--- /dev/null
+++ b/frontend/framework/basis/config.py
@@ -0,0 +1,2 @@
+# Directory holding the compiled uctc extension (presumably the CMake build output); change this to your local path.
+lib_path = "/home/hexu/learn/uc-modern-cpp-student/cc/build/"
\ No newline at end of file
diff --git a/frontend/framework/basis/test_task1.py b/frontend/framework/basis/test_task1.py
new file mode 100644
index 0000000..dc150ac
--- /dev/null
+++ b/frontend/framework/basis/test_task1.py
@@ -0,0 +1,46 @@
+import numpy as np
+import math
+from uctc.framework import basis
+binary_arguments = [  # (a, b) integer pairs passed to every two-argument operator under test
+    (1, 2),
+    (-2, 1),
+    (1, 1),
+    (2, -2),
+    (1, 3),
+    (3, 1),
+    (-3, 3),
+    (4, 5),
+    (5, 4),
+    (4, 4),
+    (5, 5)
+]
+
+singular_arguments = [  # integer inputs passed to every one-argument operator under test
+    1, 2, 4, -32, 42, 28, 0, 100, -1000, 10000, -100000
+]
+
+def iterate_binary_arguments(func, std_func):  # Compare func with the reference std_func on every pair in binary_arguments; returns True when all match.
+    for argument in binary_arguments:
+        if func(*argument) != std_func(*argument):
+            print(f"\033[1;31mError: {func.__name__}({argument}) = {func(*argument)} != {std_func.__name__}({argument}) = {std_func(*argument)}\033[0m")
+            raise SystemExit(1)  # a failed check must produce a non-zero exit status
+    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
+    return True
+
+def iterate_singular_arguments(func, std_func):  # Compare func with the reference std_func on every value in singular_arguments; returns True when all match.
+    for argument in singular_arguments:
+        if func(argument) != std_func(argument):
+            print(f"\033[1;31mError: {func.__name__}({argument}) = {func(argument)} != {std_func.__name__}({argument}) = {std_func(argument)}\033[0m")
+            raise SystemExit(1)  # a failed check must produce a non-zero exit status
+    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
+    return True
+
+# Test task 1: each uctc.framework.basis operator is checked against a pure-Python reference lambda.
+iterate_binary_arguments(basis.mul, lambda x, y: x * y)
+iterate_singular_arguments(basis.id, lambda x: x)
+iterate_binary_arguments(basis.add, lambda x, y: x + y)
+iterate_singular_arguments(basis.neg, lambda x: -x)
+iterate_binary_arguments(basis.lt, lambda x, y: int(x < y))  # reference comparison result is converted to int
+iterate_binary_arguments(basis.eq, lambda x, y: int(x == y))  # reference comparison result is converted to int
+iterate_binary_arguments(basis.max, lambda x, y: max(x, y))
+print(f"\033[1;32m[PASSED] Task 1 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/basis/test_task2.py b/frontend/framework/basis/test_task2.py
new file mode 100644
index 0000000..c9dfd8b
--- /dev/null
+++ b/frontend/framework/basis/test_task2.py
@@ -0,0 +1,55 @@
+from uctc.framework import basis
+import numpy as np
+import math
+
+binary_arguments = [  # (x, y) float pairs passed to every two-argument function under test
+    (1.0, 2.0),
+    (2.0, 1.0),
+    (-1.0, 1.0),
+    (2.0, -2.0),
+    (1.0, 3.0),
+    (3.0, -1.0),
+    (3.0, 3.0),
+    (-4.0, -5.0),
+    (5.0, 4.0),
+    (4.0, 4.0),
+    (5.0, 5.0)
+]
+
+singular_arguments = [  # float inputs passed to every one-argument function under test (none is zero, so 1.0/x is defined)
+    1.0, -3.2, 4.3, 5.5, -6.7, 4.8, 3.33, 2.22, 1.11
+]
+
+def is_close(x, y):  # True when x and y differ by less than the absolute tolerance 1e-5.
+    return 1e-5 > abs(x - y)
+
+def sigmoid(x):  # Reference logistic function; each branch only ever exponentiates a non-positive number.
+    if x < 0:
+        e_x = math.exp(x)
+        return e_x / (1 + e_x)
+    return 1 / (1 + math.exp(-x))
+
+def iterate_binary_arguments(func, std_func):  # Compare func with the reference std_func (within is_close) on every pair in binary_arguments.
+    for argument in binary_arguments:
+        if not is_close(func(*argument), std_func(*argument)):
+            print(f"\033[1;31mError: {func.__name__}({argument}) = {func(*argument)} != {std_func.__name__}({argument}) = {std_func(*argument)}\033[0m")
+            raise SystemExit(1)  # a failed check must produce a non-zero exit status
+    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
+    return True
+
+def iterate_singular_arguments(func, std_func):  # Compare func with the reference std_func (within is_close) on every value in singular_arguments.
+    for argument in singular_arguments:
+        if not is_close(func(argument), std_func(argument)):
+            print(f"\033[1;31mError: {func.__name__}({argument}) = {func(argument)} != {std_func.__name__}({argument}) = {std_func(argument)}\033[0m")
+            raise SystemExit(1)  # a failed check must produce a non-zero exit status
+    print(f"\033[1;34mPassed: {func.__name__} passed all tests\033[0m")
+    return True
+
+# Test task 2: each float function in uctc.framework.basis is checked against a pure-Python reference.
+iterate_binary_arguments(basis.is_close, lambda x, y: 1.0*int(is_close(x, y)))  # reference result is mapped to 1.0 / 0.0
+iterate_singular_arguments(basis.sigmoid, lambda x: sigmoid(x))
+iterate_singular_arguments(basis.relu, lambda x: x if x > 0.0 else 0.0)
+iterate_singular_arguments(basis.inv, lambda x: 1.0/x)
+iterate_binary_arguments(basis.inv_back, lambda x, d: -d/(x*x))  # reference: d times the derivative of 1/x
+iterate_binary_arguments(basis.relu_back, lambda x, d: d * 1.0 if x > 0.0 else 0.0)  # reference: d passes through only where x > 0
+print(f"\033[1;32m[PASSED] Task 2 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/basis/test_task3.py b/frontend/framework/basis/test_task3.py
new file mode 100644
index 0000000..9987e17
--- /dev/null
+++ b/frontend/framework/basis/test_task3.py
@@ -0,0 +1,20 @@
+from uctc.framework import basis
+import numpy as np
+import math
+import random
+
+def is_close(x, y):  # absolute tolerance of 1e-5
+    return abs(x - y) < 1e-5
+
+arr = [random.random() for i in range(128)]  # 128 random floats in [0, 1)
+
+test_x = basis.negList(arr)
+
+test_y = [-e for e in arr]  # pure-Python reference result
+assert len(test_x) == len(test_y), f"negList returned {len(test_x)} elements, expected {len(test_y)}"  # zip() below would silently hide a short result
+for i, (x, y) in enumerate(zip(test_x, test_y)):
+    if not is_close(x, y):
+        print(f"\033[1;31mError: {basis.negList.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
+        raise SystemExit(1)  # a failed check must produce a non-zero exit status
+print(f"\033[1;34mPassed: {basis.negList.__name__} passed all tests\033[0m")
+print(f"\033[1;32m[PASSED] Task 3 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/basis/test_task4_5.py b/frontend/framework/basis/test_task4_5.py
new file mode 100644
index 0000000..945d3e4
--- /dev/null
+++ b/frontend/framework/basis/test_task4_5.py
@@ -0,0 +1,21 @@
+from uctc.framework import basis
+import numpy as np
+import math
+import random
+
+def is_close(x, y):  # absolute tolerance of 1e-5
+    return abs(x - y) < 1e-5
+
+arr_a = [random.random() for i in range(128)]  # two lists of 128 random floats in [0, 1)
+arr_b = [random.random() for i in range(128)]
+
+test_x = basis.addLists(arr_a, arr_b)
+
+test_y = [e1 + e2 for e1, e2 in zip(arr_a, arr_b)]  # pure-Python reference result
+assert len(test_x) == len(test_y), f"addLists returned {len(test_x)} elements, expected {len(test_y)}"  # zip() below would silently hide a short result
+for i, (x, y) in enumerate(zip(test_x, test_y)):
+    if not is_close(x, y):
+        print(f"\033[1;31mError: {basis.addLists.__name__} failed test at position {i}, expects {y} but gets {x}\033[0m")
+        raise SystemExit(1)  # a failed check must produce a non-zero exit status
+print(f"\033[1;34mPassed: {basis.addLists.__name__} passed all tests\033[0m")
+print(f"\033[1;32m[PASSED] Task 4 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/basis/test_task6.py b/frontend/framework/basis/test_task6.py
new file mode 100644
index 0000000..b020353
--- /dev/null
+++ b/frontend/framework/basis/test_task6.py
@@ -0,0 +1,30 @@
+from uctc.framework import basis
+import numpy as np
+from functools import reduce
+import random
+
+def is_close(x, y):  # absolute tolerance of 1e-3
+    return abs(x - y) < 1e-3
+
+arr = [random.random() for i in range(128)]  # 128 random floats in [0, 1)
+
+test_x1 = basis.sumList(arr)
+
+test_x2 = basis.prodList(arr)
+
+test_y1 = reduce(lambda x, y: x + y, arr, 0.0)  # pure-Python reference sum
+
+test_y2 = reduce(lambda x, y: x * y, arr, 1.0)  # pure-Python reference product
+
+# NOTE(review): the product of 128 values below 1 is tiny, so the absolute tolerance makes the prodList check very weak.
+if not is_close(test_x1, test_y1):
+    print(f"\033[1;31mError: {basis.sumList.__name__} failed test... expects {test_y1} but gets {test_x1}\033[0m")
+    raise SystemExit(1)  # a failed check must produce a non-zero exit status
+print(f"\033[1;34mPassed: {basis.sumList.__name__} passed all tests\033[0m")
+
+if not is_close(test_x2, test_y2):
+    print(f"\033[1;31mError: {basis.prodList.__name__} failed test... expects {test_y2} but gets {test_x2}\033[0m")
+    raise SystemExit(1)  # a failed check must produce a non-zero exit status
+print(f"\033[1;34mPassed: {basis.prodList.__name__} passed all tests\033[0m")
+
+print(f"\033[1;32m[PASSED] Task 6-7 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/framework/tensor/task13_14.py b/frontend/framework/tensor/task13_14.py
new file mode 100644
index 0000000..32e4f0c
--- /dev/null
+++ b/frontend/framework/tensor/task13_14.py
@@ -0,0 +1,41 @@
+import numpy as np
+
+import uctc.nn as nn
+
+tensor1 = np.random.rand(42, 48)  # random 42x48 reference matrix
+
+tensor2 = nn.pyarray_to_tensor(tensor1)  # the same values as a uctc Tensor
+
+t_tensor1 = tensor1.transpose()
+
+t_tensor2 = tensor2.transpose()
+
+t_2data = t_tensor2.data()
+
+t_1data = t_tensor1.flatten().tolist()  # row-major flattening of the numpy transpose
+
+def is_close(x, y):  # absolute tolerance of 1e-5
+    return abs(x - y) < 1e-5
+
+for i in range(len(t_1data)):
+    if not is_close(t_1data[i], t_2data[i]):
+        print(f"\033[1;31mTask 13 Error: t1 data[{i}] != t2 data[{i}]\033[0m")
+        raise SystemExit(1)  # a failed check must produce a non-zero exit status
+
+at2 = nn.argmax(tensor2, 0).data()
+at1 = np.argmax(tensor1, 0).flatten().tolist()
+
+for i in range(len(at1)):
+    if not is_close(at1[i], at2[i]):
+        print(f"\033[1;31mTask 14 Error: at1 data[{i}] != at2 data[{i}]\033[0m")
+        raise SystemExit(1)
+
+at4 = nn.argmax(tensor2, 1).data()
+at3 = np.argmax(tensor1, 1).flatten().tolist()
+
+for i in range(len(at3)):  # compare the axis=1 results; the axis=0 pair is covered by the loop above
+    if not is_close(at3[i], at4[i]):
+        print(f"\033[1;31mTask 14 Error: at3 data[{i}] != at4 data[{i}]\033[0m")
+        raise SystemExit(1)
+
+print(f"\033[1;32m[PASSED] Task 13-14 finished!\033[0m")
\ No newline at end of file
diff --git a/frontend/mnist/autofrader.py b/frontend/mnist/autofrader.py
new file mode 100644
index 0000000..3763fbe
--- /dev/null
+++ b/frontend/mnist/autofrader.py
@@ -0,0 +1,579 @@
+# A custom autograder for this project
+
+################################################################################
+# A mini-framework for autograding
+################################################################################
+
+import optparse
+import pickle
+import random
+import sys
+import traceback
+
+class WritableNull:  # Minimal file-like sink: accepts writes and flushes and discards everything.
+    def write(self, string):
+        return None  # the text is dropped
+
+    def flush(self):
+        return None  # nothing is buffered
+
+class Tracker(object):  # Scoreboard and progress reporter for one autograder run.
+    def __init__(self, questions, maxes, prereqs, mute_output):
+        self.questions, self.maxes, self.prereqs = questions, maxes, prereqs
+        # every question starts at zero points
+        self.points = dict.fromkeys(self.questions, 0)
+        # question currently being graded (None between questions)
+        self.current_question = None
+
+        # per-test / per-question bookkeeping, filled in by begin_test() and begin_q()
+        self.current_test = None
+        self.points_at_test_start = None
+        self.possible_points_remaining = None
+
+        # stdout muting state
+        self.mute_output = mute_output
+        self.original_stdout = None
+        self.muted = False
+
+    def mute(self):  # Swap sys.stdout for a discarding sink; does nothing when already muted.
+        if not self.muted:
+            self.muted = True
+            # keep the real stream so unmute() can restore it
+            self.original_stdout = sys.stdout
+            sys.stdout = WritableNull()
+        return None
+
+    def unmute(self):  # Restore the stream saved by mute(); does nothing when not muted.
+        if self.muted:
+            self.muted = False
+            # hand the saved stream back
+            sys.stdout = self.original_stdout
+        return None
+
+    def begin_q(self, q):  # Print the question banner; return False when an unmet prerequisite blocks the question.
+        assert q in self.questions
+        banner = 'Question {}'.format(q)
+        print('\n' + banner)
+        print('=' * len(banner))
+        for needed in sorted(self.prereqs[q]):
+            if self.points[needed] >= self.maxes[needed]:
+                continue  # this prerequisite earned full marks
+            print("""*** NOTE: Make sure to complete Question {} before working on Question {},
+*** because Question {} builds upon your answer for Question {}.
+""".format(needed, q, q, needed))
+            return False
+        # all prerequisites satisfied: start tracking this question
+        self.current_question = q
+        self.possible_points_remaining = self.maxes[q]
+        return True
+
+    def begin_test(self, test_name):  # Announce a test and snapshot the score so end_test() can tell whether it scored.
+        self.points_at_test_start = self.points[self.current_question]
+        self.current_test = test_name
+        print("*** {}) {}".format(self.current_question, self.current_test))
+        if self.mute_output:
+            self.mute()
+
+    def end_test(self, pts):  # Close the running test (worth `pts`): report PASS/FAIL and clear the per-test state.
+        if self.mute_output:
+            self.unmute()
+        self.possible_points_remaining -= pts
+        score_now = self.points[self.current_question]
+        baseline = self.points_at_test_start
+        if score_now == baseline + pts:
+            print("*** PASS: {}".format(self.current_test))
+        elif score_now == baseline:
+            print("*** FAIL")  # partial credit prints neither line
+        self.current_test = self.points_at_test_start = None
+
+    def end_q(self):  # Print the question's score line and leave the question.
+        assert self.current_question is not None
+        assert self.possible_points_remaining == 0  # every point of the question has been accounted for by end_test()
+        finished = self.current_question
+        summary = '\n### Question {}: {}/{} ###'.format(
+            finished, self.points[finished], self.maxes[finished])
+        print(summary)
+
+        self.current_question = None
+        self.possible_points_remaining = None
+
+    def finalize(self):  # Print the finish time, the per-question scores and the overall total.
+        import time
+        hour_min_sec = time.localtime()[3:6]
+        print('\nFinished at %d:%02d:%02d' % hour_min_sec)
+        print("\nProvisional grades\n==================")
+        for name in self.questions:
+            print('Question %s: %d/%d' % (name, self.points[name], self.maxes[name]))
+        print('------------------')
+        total_earned = sum(self.points.values())
+        total_possible = sum(self.maxes[name] for name in self.questions)
+        print('Total: %d/%d' % (total_earned, total_possible))
+        print("""
+Your grades are NOT yet registered.  To register your grades, make sure
+to follow your instructor's guidelines to receive credit on your project.
+""")
+
+    def add_points(self, pts):  # Credit `pts` to the question currently being graded.
+        self.points[self.current_question] += pts
+
+TESTS = []
+PREREQS = {}
+def add_prereq(q, pre):  # Record that question `q` requires question(s) `pre` (one name or an iterable of names).
+    # a bare string is one question name, not an iterable of characters
+    required = [pre] if isinstance(pre, str) else pre
+    # create the entry on first use, then merge into it in place
+    # (the set stored in PREREQS is the object being updated)
+    existing = PREREQS.setdefault(q, set())
+    existing |= set(required)
+
+def test(q, points):  # Decorator factory: registers the decorated function as a test worth `points` for question `q`.
+    def register(fn):
+        TESTS.append((q, points, fn))  # main() later iterates TESTS in this registration order
+        return fn  # the function itself is returned unchanged
+    return register
+
+def parse_options(argv):  # Parse the autograder's command-line flags from `argv` and return the optparse options object.
+    parser = optparse.OptionParser(description='Run public tests on student code')
+    parser.set_defaults(
+        edx_output=False,
+        gs_output=False,
+        no_graphics=False,
+        mute_output=False,
+        check_dependencies=False,
+    )
+    parser.add_option('--edx-output',
+                      dest='edx_output',
+                      action='store_true',
+                      help='Ignored, present for compatibility only')
+    parser.add_option('--gradescope-output',
+                      dest='gs_output',
+                      action='store_true',
+                      help='Ignored, present for compatibility only')
+    parser.add_option('--question', '-q',
+                      dest='grade_question',
+                      default=None,
+                      help='Grade only one question (e.g. `-q q1`)')
+    parser.add_option('--no-graphics',
+                      dest='no_graphics',
+                      action='store_true',
+                      help='Do not display graphics (visualizing your implementation is highly recommended for debugging).')
+    parser.add_option('--mute',
+                      dest='mute_output',
+                      action='store_true',
+                      help='Mute output from executing tests')
+    parser.add_option('--check-dependencies',
+                      dest='check_dependencies',
+                      action='store_true',
+                      help='check that numpy and matplotlib are installed')
+    options, _positional = parser.parse_args(argv)  # positional arguments are ignored
+    return options
+
+def main():  # Run every registered test, grouped by question, and print the grade summary.
+    options = parse_options(sys.argv)
+    if options.check_dependencies:
+        check_dependencies()
+        return
+
+    if options.no_graphics:
+        disable_graphics()
+
+    # collect the question names and each question's maximum score from the registry
+    question_names = set()
+    maxes = {}
+    for q, points, _fn in TESTS:
+        question_names.add(q)
+        maxes[q] = maxes.get(q, 0) + points
+        PREREQS.setdefault(q, set())  # every question gets a (possibly empty) prerequisite set
+
+    questions = sorted(question_names)
+    only = options.grade_question
+    if only:
+        if only not in questions:
+            print("ERROR: question {} does not exist".format(only))
+            sys.exit(1)
+        questions = [only]
+        PREREQS[only] = set()  # a single requested question is graded regardless of its prerequisites
+
+    tracker = Tracker(questions, maxes, PREREQS, options.mute_output)
+    for q in questions:
+        if not tracker.begin_q(q):
+            continue  # blocked by an unmet prerequisite
+
+        # run this question's tests in registration order
+        for testq, points, fn in TESTS:
+            if testq != q:
+                continue
+            tracker.begin_test(fn.__name__)
+            try:
+                fn(tracker)
+            except KeyboardInterrupt:
+                tracker.unmute()
+                print("\n\nCaught KeyboardInterrupt: aborting autograder")
+                tracker.finalize()
+                print("\n[autograder was interrupted before finishing]")
+                sys.exit(1)
+            except:
+                tracker.unmute()
+                print(traceback.format_exc())  # any other failure is reported and grading continues
+            tracker.end_test(points)
+        tracker.end_q()
+    tracker.finalize()
+
+################################################################################
+# Tests begin here
+################################################################################
+
+import numpy as np
+import matplotlib
+import contextlib
+
+import nn
+import backend
+
+def check_dependencies():  # Smoke test for numpy + matplotlib: animates a rotating line segment for 400 frames.
+    import matplotlib.pyplot as plt
+    import time
+    figure, axes = plt.subplots(1, 1)
+    axes.set_xlim([-1, 1])
+    axes.set_ylim([-1, 1])
+    (segment,) = axes.plot([], [], color="black")
+    plt.show(block=False)
+
+    for frame in range(400):
+        theta = frame * 0.05
+        dx, dy = np.sin(theta), np.cos(theta)
+        # the two endpoints are mirror images through the origin
+        segment.set_data([dx, -dx], [dy, -dy])
+        figure.canvas.draw_idle()
+        figure.canvas.start_event_loop(1e-3)
+
+def disable_graphics():  # Turn graphics off for the rest of the run by clearing the flag on the backend module.
+    backend.use_graphics = False
+
+@contextlib.contextmanager
+def no_graphics():  # Context manager that switches backend graphics off inside the with-block.
+    old_use_graphics, backend.use_graphics = backend.use_graphics, False
+    try:
+        yield
+    finally:
+        backend.use_graphics = old_use_graphics  # restored even when the with-body raises
+def verify_node(node, expected_type, expected_shape, method_name):  # Assert that `node` has the expected kind and, unless it is a loss, the expected shape.
+    if expected_type == 'parameter':
+        assert node is not None, (
+            "{} should return an instance of nn.Parameter, not None".format(method_name))
+        assert isinstance(node, nn.Parameter), (
+            "{} should return an instance of nn.Parameter, instead got type {!r}".format(
+            method_name, type(node).__name__))
+    elif expected_type == 'loss':
+        assert node is not None, (
+            "{} should return an instance a loss node, not None".format(method_name))
+        assert isinstance(node, (nn.SquareLoss, nn.SoftmaxLoss)), (
+            "{} should return a loss node, instead got type {!r}".format(
+            method_name, type(node).__name__))
+    elif expected_type == 'node':
+        assert node is not None, (
+            "{} should return a node object, not None".format(method_name))
+        assert isinstance(node, nn.Node), (
+            "{} should return a node object, instead got type {!r}".format(
+            method_name, type(node).__name__))
+    else:
+        assert False, "If you see this message, please report a bug in the autograder"
+
+    if expected_type != 'loss':  # loss nodes are exempt from the shape check
+        assert all([(expected == '?' or actual == expected) for (actual, expected) in zip(node.data.shape, expected_shape)]), (  # '?' is a wildcard dimension; compared by value, not identity
+            "{} should return an object with shape {}, got {}".format(
+                method_name, nn.format_shape(expected_shape), nn.format_shape(node.data.shape)))
+
+def trace_node(node_to_trace):
+    """
+    Returns a set containing the node and all ancestors in the computation graph
+    """
+    nodes = set()
+    # `nodes` is both the visited marker and the result.
+
+    def visit(node):
+        # depth-first walk along `parents`; a node already collected is not expanded again
+        if node not in nodes:
+            for parent in node.parents:
+                visit(parent)
+            nodes.add(node)
+
+    visit(node_to_trace)
+
+    return nodes
+
+@test('q1', points=6)
+def check_perceptron(tracker):  # Question 1: 2 points for the sanity checks, 4 more for training to 100% accuracy.
+    import models
+
+    print("Sanity checking perceptron...")
+    np_random = np.random.RandomState(0)  # fixed seed: the random test points are reproducible
+    # Check that the perceptron weights are initialized to a vector with `dimensions` entries.
+    for dimensions in range(1, 10):
+        p = models.PerceptronModel(dimensions)
+        p_weights = p.get_weights()
+        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")
+
+    # Check that run returns a node, and that the score in the node is correct
+    for dimensions in range(1, 10):
+        p = models.PerceptronModel(dimensions)
+        p_weights = p.get_weights()
+        verify_node(p_weights, 'parameter', (1, dimensions), "PerceptronModel.get_weights()")
+        point = np_random.uniform(-10, 10, (1, dimensions))
+        score = p.run(nn.Constant(point))
+        verify_node(score, 'node', (1, 1), "PerceptronModel.run()")
+        calculated_score = nn.as_scalar(score)
+        expected_score = float(np.dot(point.flatten(), p_weights.data.flatten()))
+        assert np.isclose(calculated_score, expected_score), (
+            "The score computed by PerceptronModel.run() ({:.4f}) does not match the expected score ({:.4f})".format(
+            calculated_score, expected_score))
+
+    # Check that get_prediction returns the correct values, including the
+    # case when a point lies exactly on the decision boundary
+    for dimensions in range(1, 10):
+        p = models.PerceptronModel(dimensions)
+        random_point = np_random.uniform(-10, 10, (1, dimensions))
+        for point in (random_point, np.zeros_like(random_point)):
+            prediction = p.get_prediction(nn.Constant(point))
+            assert prediction == 1 or prediction == -1, (
+                "PerceptronModel.get_prediction() should return 1 or -1, not {}".format(
+                prediction))
+
+            expected_prediction = np.where(np.dot(point, p.get_weights().data.T) >= 0, 1, -1).item()  # .item() replaces np.asscalar, which NumPy 1.23 removed
+            assert prediction == expected_prediction, (
+                "PerceptronModel.get_prediction() returned {}; expected {}".format(
+                    prediction, expected_prediction))
+
+    tracker.add_points(2) # Partial credit for passing sanity checks
+
+    print("Sanity checking perceptron weight updates...")
+
+    # Test weight updates. This involves constructing a dataset that
+    # requires 0 or 1 updates before convergence, and testing that weight
+    # values change as expected. Note that (multiplier < -1 or multiplier > 1)
+    # must be true for the testing code to be correct.
+    dimensions = 2
+    for multiplier in (-5, -2, 2, 5):
+        p = models.PerceptronModel(dimensions)
+        orig_weights = p.get_weights().data.reshape((1, dimensions)).copy()
+        if np.abs(orig_weights).sum() == 0.0:
+            # This autograder test doesn't work when weights are exactly zero
+            continue
+        point = multiplier * orig_weights
+        sanity_dataset = backend.Dataset(
+            x=np.tile(point, (500, 1)),
+            y=np.ones((500, 1)) * -1.0
+        )
+        p.train(sanity_dataset)
+        new_weights = p.get_weights().data.reshape((1, dimensions))
+
+        if multiplier < 0:
+            expected_weights = orig_weights  # the point is already classified as -1, so no update is expected
+        else:
+            expected_weights = orig_weights - point  # exactly one update: w <- w + y * x with y = -1
+
+        if not np.all(new_weights == expected_weights):
+            print()
+            print("Initial perceptron weights were: [{:.4f}, {:.4f}]".format(
+                orig_weights[0,0], orig_weights[0,1]))
+            print("All data points in the dataset were identical and had:")
+            print("    x = [{:.4f}, {:.4f}]".format(
+                point[0,0], point[0,1]))
+            print("    y = -1")
+            print("Your trained weights were: [{:.4f}, {:.4f}]".format(
+                new_weights[0,0], new_weights[0,1]))
+            print("Expected weights after training: [{:.4f}, {:.4f}]".format(
+                expected_weights[0,0], expected_weights[0,1]))
+            print()
+            assert False, "Weight update sanity check failed"
+
+    print("Sanity checking complete. Now training perceptron")
+    model = models.PerceptronModel(3)
+    dataset = backend.PerceptronDataset(model)
+
+    model.train(dataset)
+    backend.maybe_sleep_and_close(1)
+
+    assert dataset.epoch != 0, "Perceptron code never iterated over the training data"
+
+    accuracy = np.mean(np.where(np.dot(dataset.x, model.get_weights().data.T) >= 0.0, 1.0, -1.0) == dataset.y)
+    if accuracy < 1.0:
+        print("The weights learned by your perceptron correctly classified {:.2%} of training examples".format(accuracy))
+        print("To receive full points for this question, your perceptron must converge to 100% accuracy")
+        return
+
+    tracker.add_points(4)
+
+@test('q2', points=6)
+def check_regression(tracker):
+    """Grade q2: sanity-check the RegressionModel graph, then train it and score the final loss."""
+    import models
+    net = models.RegressionModel()
+    data = backend.RegressionDataset(net)
+
+    # run() must return a (batch, 1) node that depends on x and re-uses one set of parameters.
+    known_params = None
+    for n in (1, 2, 4):
+        x_node = nn.Constant(data.x[:n])
+        y_node = nn.Constant(data.y[:n])
+        prediction = net.run(x_node)
+        verify_node(prediction, 'node', (n, 1), "RegressionModel.run()")
+        graph = trace_node(prediction)
+        assert x_node in graph, "Node returned from RegressionModel.run() does not depend on the provided input (x)"
+        if known_params is None:
+            known_params = [item for item in graph if isinstance(item, nn.Parameter)]
+        for item in graph:
+            if isinstance(item, nn.Parameter):
+                assert item in known_params, (
+                    "Calling RegressionModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")
+
+    # get_loss() must depend on both x and y and use no parameters beyond those seen in run().
+    for n in (1, 2, 4):
+        x_node = nn.Constant(data.x[:n])
+        y_node = nn.Constant(data.y[:n])
+        loss = net.get_loss(x_node, y_node)
+        verify_node(loss, 'loss', None, "RegressionModel.get_loss()")
+        graph = trace_node(loss)
+        assert x_node in graph, "Node returned from RegressionModel.get_loss() does not depend on the provided input (x)"
+        assert y_node in graph, "Node returned from RegressionModel.get_loss() does not depend on the provided labels (y)"
+        for item in graph:
+            if isinstance(item, nn.Parameter):
+                assert item in known_params, (
+                    "RegressionModel.get_loss() should not use additional parameters not used by RegressionModel.run()")
+
+    tracker.add_points(2) # Partial credit for passing sanity checks
+
+    net.train(data)
+    backend.maybe_sleep_and_close(1)
+
+    final_loss_node = net.get_loss(nn.Constant(data.x), nn.Constant(data.y))
+    verify_node(final_loss_node, 'loss', None, "RegressionModel.get_loss()")
+    final_loss = nn.as_scalar(final_loss_node)
+
+    # Recompute the loss from run() ourselves so a get_loss() hard-coded to return zero cannot pass.
+    outputs = net.run(nn.Constant(data.x))
+    verify_node(outputs, 'node', (data.x.shape[0], 1), "RegressionModel.run()")
+    reference_loss = 0.5 * np.mean((outputs.data - data.y)**2)
+
+    assert np.isclose(final_loss, reference_loss), (
+        "RegressionModel.get_loss() returned a loss of {:.4f}, "
+        "but the autograder computed a loss of {:.4f} "
+        "based on the output of RegressionModel.run()".format(
+            final_loss, reference_loss))
+    max_loss = 0.02
+    if not (final_loss <= max_loss):
+        print("Your final loss ({:f}) must be no more than {:.4f} to receive full points for this question".format(final_loss, max_loss))
+        return
+    print("Your final loss is: {:f}".format(final_loss))
+    tracker.add_points(4)
+
+@test('q3', points=6)
+def check_digit_classification(tracker):
+    """Grade q3: sanity-check the DigitClassificationModel graph, then train it and score test accuracy."""
+    import models
+    net = models.DigitClassificationModel()
+    data = backend.DigitClassificationDataset(net)
+
+    # run() must return a (batch, 10) node that depends on x and re-uses one set of parameters.
+    known_params = None
+    for n in (1, 2, 4):
+        x_node = nn.Constant(data.x[:n])
+        y_node = nn.Constant(data.y[:n])
+        scores = net.run(x_node)
+        verify_node(scores, 'node', (n, 10), "DigitClassificationModel.run()")
+        graph = trace_node(scores)
+        assert x_node in graph, "Node returned from DigitClassificationModel.run() does not depend on the provided input (x)"
+        if known_params is None:
+            known_params = [item for item in graph if isinstance(item, nn.Parameter)]
+        for item in graph:
+            if isinstance(item, nn.Parameter):
+                assert item in known_params, (
+                    "Calling DigitClassificationModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")
+
+    # get_loss() must depend on both x and y and use no parameters beyond those seen in run().
+    for n in (1, 2, 4):
+        x_node = nn.Constant(data.x[:n])
+        y_node = nn.Constant(data.y[:n])
+        loss = net.get_loss(x_node, y_node)
+        verify_node(loss, 'loss', None, "DigitClassificationModel.get_loss()")
+        graph = trace_node(loss)
+        assert x_node in graph, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided input (x)"
+        assert y_node in graph, "Node returned from DigitClassificationModel.get_loss() does not depend on the provided labels (y)"
+        for item in graph:
+            if isinstance(item, nn.Parameter):
+                assert item in known_params, (
+                    "DigitClassificationModel.get_loss() should not use additional parameters not used by DigitClassificationModel.run()")
+
+    tracker.add_points(2) # Partial credit for passing sanity checks
+
+    net.train(data)
+    logits = net.run(nn.Constant(data.test_images)).data
+    guesses = np.argmax(logits, axis=1)
+    accuracy = np.mean(guesses == data.test_labels)
+    required = 0.97
+    if not (accuracy >= required):
+        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(accuracy, required))
+        return
+    print("Your final test set accuracy is: {:%}".format(accuracy))
+    tracker.add_points(4)
+
+@test('q4', points=7)
+def check_lang_id(tracker):
+    """Grade q4: sanity-check the LanguageIDModel graph, then train it and score test accuracy."""
+    import models
+    model = models.LanguageIDModel()
+    dataset = backend.LanguageIDDataset(model)
+
+    detected_parameters = None
+    for batch_size, word_length in ((1, 1), (2, 1), (2, 6), (4, 8)):
+        start = dataset.dev_buckets[-1, 0]  # start index stored in the last row of dev_buckets
+        end = start + batch_size
+        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
+        inp_xs = inp_xs[:word_length]  # keep only the first word_length character positions
+
+        output_node = model.run(inp_xs)
+        verify_node(output_node, 'node', (batch_size, len(dataset.language_names)), "LanguageIDModel.run()")
+        trace = trace_node(output_node)
+        for inp_x in inp_xs:
+            assert inp_x in trace, "Node returned from LanguageIDModel.run() does not depend on all of the provided inputs (xs)"
+
+        # Word length 1 does not use parameters related to transferring the hidden state
+        # across timesteps, so initial parameter detection is only run for longer words
+        if word_length > 1:
+            if detected_parameters is None:
+                detected_parameters = [node for node in trace if isinstance(node, nn.Parameter)]
+
+            for node in trace:
+                assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
+                    "Calling LanguageIDModel.run() multiple times should always re-use the same parameters, but a new nn.Parameter object was detected")
+
+    for batch_size, word_length in ((1, 1), (2, 1), (2, 6), (4, 8)):
+        start = dataset.dev_buckets[-1, 0]
+        end = start + batch_size
+        inp_xs, inp_y = dataset._encode(dataset.dev_x[start:end], dataset.dev_y[start:end])
+        inp_xs = inp_xs[:word_length]
+        loss_node = model.get_loss(inp_xs, inp_y)
+        trace = trace_node(loss_node)
+        for inp_x in inp_xs:
+            assert inp_x in trace, "Node returned from LanguageIDModel.get_loss() does not depend on all of the provided inputs (xs)"
+        assert inp_y in trace, "Node returned from LanguageIDModel.get_loss() does not depend on the provided labels (y)"
+
+        for node in trace:
+            assert not isinstance(node, nn.Parameter) or node in detected_parameters, (
+                "LanguageIDModel.get_loss() should not use additional parameters not used by LanguageIDModel.run()")
+
+    tracker.add_points(2) # Partial credit for passing sanity checks
+
+    model.train(dataset)
+
+    test_predicted_probs, test_predicted, test_correct = dataset._predict('test')
+    test_accuracy = np.mean(test_predicted == test_correct)
+    accuracy_threshold = 0.81
+    if test_accuracy >= accuracy_threshold:
+        print("Your final test set accuracy is: {:%}".format(test_accuracy))
+        tracker.add_points(5)
+    else:
+        print("Your final test set accuracy ({:%}) must be at least {:.0%} to receive full points for this question".format(test_accuracy, accuracy_threshold))
+
+if __name__ == '__main__':  # run main() only when this file is executed directly, not on import
+    main()
diff --git a/frontend/mnist/backend.py b/frontend/mnist/backend.py
new file mode 100644
index 0000000..0885645
--- /dev/null
+++ b/frontend/mnist/backend.py
@@ -0,0 +1,449 @@
+import collections
+import os
+import time
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+import nn
+
+use_graphics = True  # when True, the datasets below open matplotlib figures / print progress while iterating
+
+def maybe_sleep_and_close(seconds):
+    """Sleep `seconds`, then close every open matplotlib figure; no-op when graphics are off or none is open."""
+    if use_graphics and plt.get_fignums():
+        time.sleep(seconds)
+        for fignum in plt.get_fignums():
+            fig = plt.figure(fignum)
+            plt.close(fig)
+            try:
+                fig.canvas.start_event_loop(1e-3)
+            except Exception:  # e.g. TclError on some Windows machines; Ctrl-C / SystemExit still propagate
+                pass
+
+def get_data_path(filename):
+    """Return the path of the data file `filename`.
+
+    Tries ../data, ./data, then this module's own directory; raises Exception if all three miss.
+    """
+    here = os.path.dirname(__file__)
+    search_dirs = (os.path.join(here, os.pardir, "data"), os.path.join(here, "data"), here)
+    for directory in search_dirs:
+        candidate = os.path.join(directory, filename)
+        if os.path.exists(candidate):
+            return candidate
+    raise Exception("Could not find data file: {}".format(filename))
+
+class Dataset(object):
+    """Base dataset: holds 2-D float arrays `x` and `y` with equal row counts and serves them in batches."""
+    def __init__(self, x, y):
+        for array in (x, y):
+            assert isinstance(array, np.ndarray)
+        for array in (x, y):
+            assert np.issubdtype(array.dtype, np.floating)
+        for array in (x, y):
+            assert array.ndim == 2
+        assert x.shape[0] == y.shape[0]
+        self.x, self.y = x, y
+
+    def iterate_once(self, batch_size):
+        """Yield (nn.Constant(x), nn.Constant(y)) for consecutive `batch_size`-row slices, one pass."""
+        assert isinstance(batch_size, int) and batch_size > 0, (
+            "Batch size should be a positive integer, got {!r}".format(batch_size))
+        total = self.x.shape[0]
+        assert total % batch_size == 0, (
+            "Dataset size {:d} is not divisible by batch size {:d}".format(total, batch_size))
+        for begin in range(0, total, batch_size):
+            stop = begin + batch_size
+            yield nn.Constant(self.x[begin:stop]), nn.Constant(self.y[begin:stop])
+
+    def iterate_forever(self, batch_size):
+        """Yield batches endlessly by starting a new iterate_once() pass whenever one finishes."""
+        while True:
+            for batch in self.iterate_once(batch_size):
+                yield batch
+
+    def get_validation_accuracy(self):
+        """Always raises NotImplementedError: this base class has no validation data."""
+        parts = ("No validation data is available for this dataset. ",
+                 "In this assignment, only the Digit Classification and Language ",
+                 "Identification datasets have validation data.")
+        raise NotImplementedError("".join(parts))
+
+class PerceptronDataset(Dataset):
+    """500 random 2-D points plus a constant-1 column, labelled +1/-1 by the sign of x0 + 2*x1 - 1."""
+    def __init__(self, model):
+        count = 500
+        features = np.hstack([np.random.randn(count, 2), np.ones((count, 1))])
+        labels = np.where(features[:, 0] + 2 * features[:, 1] - 1 >= 0, 1.0, -1.0)
+        super().__init__(features, np.expand_dims(labels, axis=1))
+
+        self.model = model
+        self.epoch = 0
+        if not use_graphics:
+            return
+        # Scatter both classes once; keep the line and text artists that _redraw() updates.
+        self.fig, axes = plt.subplots(1, 1)
+        self.limits = np.array([-3.0, 3.0])
+        axes.set_xlim(self.limits)
+        axes.set_ylim(self.limits)
+        pos = axes.scatter(*features[labels == 1, :-1].T, color="red", marker="+")
+        neg = axes.scatter(*features[labels == -1, :-1].T, color="blue", marker="_")
+        self.line, = axes.plot([], [], color="black")
+        self.text = axes.text(0.03, 0.97, "", transform=axes.transAxes, va="top")
+        axes.legend([pos, neg], [1, -1])
+        plt.show(block=False)
+        self.last_update = time.time()
+
+    def iterate_once(self, batch_size):
+        """Yield one pass of batches, bumping `epoch` and redrawing at most every 0.01 s."""
+        self.epoch += 1
+        for batch_index, batch in enumerate(super().iterate_once(batch_size)):
+            yield batch
+            if use_graphics and time.time() - self.last_update > 0.01:
+                self._redraw(batch_index * batch_size + 1)
+
+    def _redraw(self, point_number):
+        """Move the plotted boundary line to the model's current weights and refresh the status text."""
+        w = self.model.get_weights().data.flatten()
+        span = self.limits
+        if w[1] != 0:
+            xs, ys = span, (-w[0] * span - w[2]) / w[1]
+        elif w[0] != 0:
+            xs, ys = np.full(2, -w[2] / w[0]), span
+        else:
+            xs, ys = [], []
+        self.line.set_data(xs, ys)
+        status = "epoch: {:,}\npoint: {:,}/{:,}\nweights: {}".format(self.epoch, point_number, len(self.x), w)
+        self.text.set_text(status)
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.start_event_loop(1e-3)
+        self.last_update = time.time()
+
+class RegressionDataset(Dataset):
+    """200 samples of y = sin(x) with x evenly spaced over [-2*pi, 2*pi], shuffled by a fixed seed."""
+    def __init__(self, model):
+        two_pi = 2 * np.pi
+        inputs = np.expand_dims(np.linspace(-two_pi, two_pi, num=200), axis=1)
+        np.random.RandomState(0).shuffle(inputs)
+        targets = np.sin(inputs)
+        self.argsort_x = np.argsort(inputs.flatten())
+        super().__init__(inputs, targets)
+
+        self.model = model
+        self.processed = 0
+        if not use_graphics:
+            return
+
+        # Draw the true curve once; keep the "learned" line and status text for later refreshes.
+        self.fig, axes = plt.subplots(1, 1)
+        axes.set_xlim(-two_pi, two_pi)
+        axes.set_ylim(-1.4, 1.4)
+        truth, = axes.plot(inputs[self.argsort_x], targets[self.argsort_x], color="blue")
+        self.learned, = axes.plot([], [], color="red")
+        self.text = axes.text(0.03, 0.97, "", transform=axes.transAxes, va="top")
+        axes.legend([truth, self.learned], ["real", "learned"])
+        plt.show(block=False)
+        self.last_update = time.time()
+
+    def iterate_once(self, batch_size):
+        """Yield one pass of batches, counting processed rows and refreshing the plot at most every 0.1 s."""
+        for batch in super().iterate_once(batch_size):
+            yield batch
+            self.processed += batch_size
+            if not use_graphics or time.time() - self.last_update <= 0.1:
+                continue
+            outputs = self.model.run(nn.Constant(self.x)).data
+            loss = self.model.get_loss(nn.Constant(self.x), nn.Constant(self.y)).data
+            order = self.argsort_x
+            self.learned.set_data(self.x[order], outputs[order])
+            self.text.set_text("processed: {:,}\nloss: {:.6f}".format(self.processed, loss))
+            self.fig.canvas.draw_idle()
+            self.fig.canvas.start_event_loop(1e-3)
+            self.last_update = time.time()
+
+class DigitClassificationDataset(Dataset):  # digit images loaded from mnist.npz; training labels are one-hot encoded
+    def __init__(self, model):
+        mnist_path = get_data_path("mnist.npz")
+
+        with np.load(mnist_path) as data:
+            train_images = data["train_images"]
+            train_labels = data["train_labels"]
+            test_images = data["test_images"]
+            test_labels = data["test_labels"]
+            assert len(train_images) == len(train_labels) == 60000
+            assert len(test_images) == len(test_labels) == 10000
+            self.dev_images = test_images[0::2]  # even-indexed test rows form the dev (validation) split
+            self.dev_labels = test_labels[0::2]
+            self.test_images = test_images[1::2]  # odd-indexed test rows form the held-out test split
+            self.test_labels = test_labels[1::2]
+
+        train_labels_one_hot = np.zeros((len(train_images), 10))
+        train_labels_one_hot[range(len(train_images)), train_labels] = 1  # row r gets a 1 in column train_labels[r]
+
+        super().__init__(train_images, train_labels_one_hot)
+
+        self.model = model
+        self.epoch = 0
+
+        if use_graphics:
+            width = 20  # Width of each row expressed as a multiple of image width
+            samples = 100  # Number of images to display per label
+            fig = plt.figure()
+            ax = {}
+            images = collections.defaultdict(list)
+            texts = collections.defaultdict(list)
+            for i in reversed(range(10)):  # 9 first: ax.get(9) is None for it, later axes share its x-axis
+                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
+                                         sharex=ax.get(9))
+                plt.setp(ax[i].get_xticklabels(), visible=i == 9)  # tick labels only on the axis for digit 9
+                ax[i].set_yticks([])
+                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
+                           va="center")
+                ax[i].set_xlim(0, 28 * width)
+                ax[i].set_ylim(0, 28)
+                for j in range(samples):  # placeholder image/text artists; iterate_once() fills them in
+                    images[i].append(ax[i].imshow(
+                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
+                        alpha=0.3))
+                    texts[i].append(ax[i].text(
+                        0, 0, "", ha="center", va="top", fontsize="smaller"))
+            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
+            ax[9].set_xticklabels(
+                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
+            ax[9].tick_params(axis="x", pad=16)
+            ax[9].set_xlabel("Probability of Correct Label")
+            status = ax[0].text(
+                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
+                va="bottom")
+            plt.show(block=False)
+
+            self.width = width
+            self.samples = samples
+            self.fig = fig
+            self.images = images
+            self.texts = texts
+            self.status = status
+            self.last_update = time.time()
+
+    def iterate_once(self, batch_size):  # one pass over the training batches, with periodic plot refreshes
+        self.epoch += 1
+
+        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
+            yield x, y
+
+            if use_graphics and time.time() - self.last_update > 1:  # refresh at most once per second
+                dev_logits = self.model.run(nn.Constant(self.dev_images)).data
+                dev_predicted = np.argmax(dev_logits, axis=1)
+                dev_probs = np.exp(nn.SoftmaxLoss.log_softmax(dev_logits))  # exp of log_softmax, used as per-class probabilities
+                dev_accuracy = np.mean(dev_predicted == self.dev_labels)
+
+                self.status.set_text(
+                    "epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
+                    "{:.2%}".format(
+                        self.epoch, i, len(self.x) // batch_size, dev_accuracy))
+                for i in range(10):  # NOTE: rebinds i (the batch index was consumed above); i is now the digit label
+                    predicted = dev_predicted[self.dev_labels == i]
+                    probs = dev_probs[self.dev_labels == i][:, i]  # probability given to the true digit i
+                    linspace = np.linspace(
+                        0, len(probs) - 1, self.samples).astype(int)
+                    indices = probs.argsort()[linspace]  # self.samples evenly spaced ranks, lowest to highest probability
+                    for j, (prob, image) in enumerate(zip(
+                            probs[indices],
+                            self.dev_images[self.dev_labels == i][indices])):
+                        self.images[i][j].set_data(image.reshape((28, 28)))
+                        left = prob * (self.width - 1) * 28  # x offset grows with the probability of the true label
+                        if predicted[indices[j]] == i:  # correct prediction: green, no caption
+                            self.images[i][j].set_cmap("Greens")
+                            self.texts[i][j].set_text("")
+                        else:  # wrong prediction: red, captioned with the predicted digit
+                            self.images[i][j].set_cmap("Reds")
+                            self.texts[i][j].set_text(predicted[indices[j]])
+                            self.texts[i][j].set_x(left + 14)
+                        self.images[i][j].set_extent([left, left + 28, 0, 28])
+                self.fig.canvas.draw_idle()
+                self.fig.canvas.start_event_loop(1e-3)
+                self.last_update = time.time()
+
+    def get_validation_accuracy(self):
+        """Return the fraction of dev images whose highest-scoring class equals the dev label."""
+        dev_logits = self.model.run(nn.Constant(self.dev_images)).data
+        # The predicted class is the index of the largest score in each row.
+        dev_predicted = np.argmax(dev_logits, axis=1)
+        dev_accuracy = np.mean(dev_predicted == self.dev_labels)
+        return dev_accuracy
+
+class LanguageIDDataset(Dataset):  # NOTE: never calls Dataset.__init__, so self.x / self.y are not set by this class
+    def __init__(self, model):
+        self.model = model
+
+        data_path = get_data_path("lang_id.npz")
+
+        with np.load(data_path) as data:
+            self.chars = data['chars']
+            self.language_codes = data['language_codes']
+            self.language_names = data['language_names']
+
+            self.train_x = data['train_x']
+            self.train_y = data['train_y']
+            self.train_buckets = data['train_buckets']
+            self.dev_x = data['dev_x']
+            self.dev_y = data['dev_y']
+            self.dev_buckets = data['dev_buckets']
+            self.test_x = data['test_x']
+            self.test_y = data['test_y']
+            self.test_buckets = data['test_buckets']
+
+        self.epoch = 0
+        self.bucket_weights = self.train_buckets[:,1] - self.train_buckets[:,0]  # rows per bucket (column 1 minus column 0)
+        self.bucket_weights = self.bucket_weights / float(self.bucket_weights.sum())  # normalised to sampling probabilities
+
+        self.chars_print = self.chars
+        try:
+            print(u"Alphabet: {}".format(u"".join(self.chars)))
+        except UnicodeEncodeError:
+            self.chars_print = "abcdefghijklmnopqrstuvwxyzaaeeeeiinoouuacelnszz"  # ASCII stand-ins, used for printing only
+            print("Alphabet: " + self.chars_print)
+            self.chars_print = list(self.chars_print)
+            print("""
+NOTE: Your terminal does not appear to support printing Unicode characters.
+For the purposes of printing to the terminal, some of the letters in the
+alphabet above have been substituted with ASCII symbols.""".strip())
+        print("")
+
+        # Select some examples to spotlight in the monitoring phase (3 per language)
+        spotlight_idxs = []
+        for i in range(len(self.language_names)):
+            idxs_lang_i = np.nonzero(self.dev_y == i)[0]
+            idxs_lang_i = np.random.choice(idxs_lang_i, size=3, replace=False)  # 3 distinct dev examples of language i
+            spotlight_idxs.extend(list(idxs_lang_i))
+        self.spotlight_idxs = np.array(spotlight_idxs, dtype=int)
+
+        # Templates for printing updates as training progresses
+        max_word_len = self.dev_x.shape[1]  # column count of dev_x, used as the width of the word column
+        max_lang_len = max([len(x) for x in self.language_names])
+
+        self.predicted_template = u"Pred: {:<NUM}".replace('NUM',  # 'NUM' is a stand-in replaced by the column width
+            str(max_lang_len))
+
+        self.word_template = u"  "
+        self.word_template += u"{:<NUM} ".replace('NUM', str(max_word_len))
+        self.word_template += u"{:<NUM} ({:6.1%})".replace('NUM', str(max_lang_len))
+        self.word_template += u" {:<NUM} ".replace('NUM',
+            str(max_lang_len + len('Pred: ')))
+        for i in range(len(self.language_names)):  # append one "|code prob%" column per language
+            self.word_template += u"|{}".format(self.language_codes[i])
+            self.word_template += "{probs[" + str(i) + "]:4.0%}"
+
+        self.last_update = time.time()
+
+    def _encode(self, inp_x, inp_y):  # -> (list of one-hot nn.Constant, one per character column; one-hot nn.Constant labels)
+        xs = []
+        for i in range(inp_x.shape[1]):
+            if np.all(inp_x[:,i] == -1):  # a column that is entirely -1 ends the batch (presumably -1 is padding)
+                break
+            assert not np.any(inp_x[:,i] == -1), (
+                "Please report this error in the project: batching by length was done incorrectly in the provided code")
+            x = np.eye(len(self.chars))[inp_x[:,i]]  # identity-matrix rows turn each character index into a one-hot vector
+            xs.append(nn.Constant(x))
+        y = np.eye(len(self.language_names))[inp_y]
+        y = nn.Constant(y)
+        return xs, y
+
+    def _softmax(self, x):  # softmax over the last axis
+        exp = np.exp(x - np.max(x, axis=-1, keepdims=True))  # subtracting the row max leaves the result unchanged but avoids overflow
+        return exp / np.sum(exp, axis=-1, keepdims=True)
+
+    def _predict(self, split='dev'):  # -> (class probabilities, predicted class indices, true labels) over every bucket
+        if split == 'dev':
+            data_x = self.dev_x
+            data_y = self.dev_y
+            buckets = self.dev_buckets
+        else:  # any value other than 'dev' selects the test split
+            data_x = self.test_x
+            data_y = self.test_y
+            buckets = self.test_buckets
+
+        all_predicted = []
+        all_correct = []
+        for bucket_id in range(buckets.shape[0]):
+            start, end = buckets[bucket_id]  # each bucket row is a (start, end) slice into the split
+            xs, y = self._encode(data_x[start:end], data_y[start:end])
+            predicted = self.model.run(xs)
+
+            all_predicted.extend(list(predicted.data))
+            all_correct.extend(list(data_y[start:end]))
+
+        all_predicted_probs = self._softmax(np.asarray(all_predicted))
+        all_predicted = np.asarray(all_predicted).argmax(axis=-1)
+        all_correct = np.asarray(all_correct)
+
+        return all_predicted_probs, all_predicted, all_correct
+
+    def iterate_once(self, batch_size):  # yields train_x.shape[0] // batch_size randomly drawn batches
+        assert isinstance(batch_size, int) and batch_size > 0, (
+            "Batch size should be a positive integer, got {!r}".format(
+                batch_size))
+        assert self.train_x.shape[0] >= batch_size, (
+            "Dataset size {:d} is smaller than the batch size {:d}".format(
+                self.train_x.shape[0], batch_size))
+
+        self.epoch += 1
+
+        for iteration in range(self.train_x.shape[0] // batch_size):
+            bucket_id = np.random.choice(self.bucket_weights.shape[0], p=self.bucket_weights)  # bucket drawn with probability bucket_weights
+            example_ids = self.train_buckets[bucket_id, 0] + np.random.choice(  # rows inside that bucket (choice defaults to replacement)
+                self.train_buckets[bucket_id, 1] - self.train_buckets[bucket_id, 0],
+                size=batch_size)
+
+            yield self._encode(self.train_x[example_ids], self.train_y[example_ids])
+
+            if use_graphics and time.time() - self.last_update > 0.5:  # at most every 0.5 s; use_graphics also gates this console report
+                dev_predicted_probs, dev_predicted, dev_correct = self._predict()
+                dev_accuracy = np.mean(dev_predicted == dev_correct)
+
+                print("epoch {:,} iteration {:,} validation-accuracy {:.1%}".format(
+                    self.epoch, iteration, dev_accuracy))
+
+                for idx in self.spotlight_idxs:
+                    correct = (dev_predicted[idx] == dev_correct[idx])
+                    word = u"".join([self.chars_print[ch] for ch in self.dev_x[idx] if ch != -1])  # skip -1 entries when rebuilding the word
+
+                    print(self.word_template.format(
+                        word,
+                        self.language_names[dev_correct[idx]],
+                        dev_predicted_probs[idx, dev_correct[idx]],
+                        "" if correct else self.predicted_template.format(
+                            self.language_names[dev_predicted[idx]]),
+                        probs=dev_predicted_probs[idx,:],
+                    ))
+
+                self.last_update = time.time()
+
+    def get_validation_accuracy(self):  # fraction of dev examples whose predicted language matches the label
+        dev_predicted_probs, dev_predicted, dev_correct = self._predict()
+        dev_accuracy = np.mean(dev_predicted == dev_correct)
+        return dev_accuracy
+
+
+def main():
+    """Manual entry point: train the digit classifier on its dataset (the only demo enabled)."""
+    import models
+
+    # Disabled alternatives -- swap one pair in for the two assignments below:
+    #     network = models.PerceptronModel(3)
+    #     training_data = PerceptronDataset(network)
+    #
+    #     network = models.RegressionModel()
+    #     training_data = RegressionDataset(network)
+    #
+    #     network = models.LanguageIDModel()
+    #     training_data = LanguageIDDataset(network)
+
+    network = models.DigitClassificationModel()
+    training_data = DigitClassificationDataset(network)
+    network.train(training_data)
+
+if __name__ == "__main__":  # run the training demo only when backend.py is executed directly
+    main()
diff --git a/frontend/mnist/data/lang_id.npz b/frontend/mnist/data/lang_id.npz
new file mode 100644
index 0000000..3974849
Binary files /dev/null and b/frontend/mnist/data/lang_id.npz differ
diff --git a/frontend/mnist/data/mnist.npz b/frontend/mnist/data/mnist.npz
new file mode 100644
index 0000000..abf960a
Binary files /dev/null and b/frontend/mnist/data/mnist.npz differ
diff --git a/frontend/mnist/models.py b/frontend/mnist/models.py
new file mode 100644
index 0000000..2e2a1b9
--- /dev/null
+++ b/frontend/mnist/models.py
@@ -0,0 +1,292 @@
+import nn
+
+class PerceptronModel(object):
+    def __init__(self, dimensions):
+        """
+        Initialize a new Perceptron instance.
+
+        A perceptron classifies data points as either belonging to a particular
+        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
+        For example, dimensions=2 would mean that the perceptron must classify
+        2D points.
+        """
+        self.w = nn.Parameter(1, dimensions)
+
+    def get_weights(self):
+        """
+        Return a Parameter instance with the current weights of the perceptron.
+        """
+        return self.w
+
+    def run(self, x):
+        """
+        Calculates the score assigned by the perceptron to a data point x.
+
+        Inputs:
+            x: a node with shape (1 x dimensions)
+        Returns: a node containing a single number (the score)
+        """
+        "*** YOUR CODE HERE ***"
+        return nn.DotProduct(x, self.get_weights())
+
+    def get_prediction(self, x):
+        """
+        Calculates the predicted class for a single data point `x`.
+
+        Returns: 1 or -1
+        """
+        "*** YOUR CODE HERE ***"
+        score = self.run(x)
+        if nn.as_scalar(score) >= 0:
+            return 1
+        else:
+            return -1
+
+    def train(self, dataset):
+        """
+        Train the perceptron until a full pass over the data makes no mistakes.
+        """
+        "*** YOUR CODE HERE ***"
+        batch_size = 1
+
+        while True:
+            converged = True
+            for x, y in dataset.iterate_once(batch_size):
+                # Labels arrive as nodes; compare and update with the plain number.
+                label = nn.as_scalar(y)
+                if self.get_prediction(x) != label:
+                    # Misclassified: add label * x to the weights.
+                    converged = False
+                    self.w.update(x, label)
+            if converged:
+                break
+
+
+class RegressionModel(object):
+    """
+    A neural network model for approximating a function that maps from real
+    numbers to real numbers. The network should be sufficiently large to be able
+    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.
+    """
+    def __init__(self):
+        # Initialize your model parameters here
+        "*** YOUR CODE HERE ***"
+        self.i = 1
+        self.o = 1
+
+        self.h = 50
+        self.b = 10
+        self.learning_rate = 0.01
+
+        self.W1 = nn.Parameter(self.i, self.h)
+        self.b1 = nn.Parameter(1, self.h)
+        self.W2 = nn.Parameter(self.h, self.o)
+        self.b2 = nn.Parameter(1, self.o)
+
+    def run(self, x):
+        """
+        Runs the model for a batch of examples.
+
+        Inputs:
+            x: a node with shape (batch_size x 1)
+        Returns:
+            A node with shape (batch_size x 1) containing predicted y-values
+        """
+        "*** YOUR CODE HERE ***"
+        layer_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.W1), self.b1))
+        prediction = nn.AddBias(nn.Linear(layer_1, self.W2), self.b2)
+        return prediction
+
+    def get_loss(self, x, y):
+        """
+        Computes the loss for a batch of examples.
+
+        Inputs:
+            x: a node with shape (batch_size x 1)
+            y: a node with shape (batch_size x 1), containing the true y-values
+                to be used for training
+        Returns: a loss node
+        """
+        "*** YOUR CODE HERE ***"
+        return nn.SquareLoss(self.run(x), y)
+        
+
+    def train(self, dataset):
+        """
+        Trains the model.
+        """
+        "*** YOUR CODE HERE ***"
+        for i in range(20):
+            for x, y in dataset.iterate_once(self.b):
+                loss = self.get_loss(x, y)
+                print(loss.data)
+                g_W1, g_b1, g_W2, g_b2 = nn.gradients(loss, [self.W1, self.b1, self.W2, self.b2])
+                # print(g_W1.data)
+                # print(g_b1.data)
+                # print(g_W2.data)
+                # print(g_b2.data)
+                self.W1.update(g_W1, -self.learning_rate)
+                self.b1.update(g_b1, -self.learning_rate)
+                self.W2.update(g_W2, -self.learning_rate)
+                self.b2.update(g_b2, -self.learning_rate)
+            if loss.data < 0.01:
+                break
+        
+
+class DigitClassificationModel(object):
+    """
+    A model for handwritten digit classification using the MNIST dataset.
+
+    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
+    into a 784-dimensional vector for the purposes of this model. Each entry in
+    the vector is a floating point number between 0 and 1.
+
+    The goal is to sort each digit into one of 10 classes (number 0 through 9).
+
+    (See RegressionModel for more information about the APIs of different
+    methods here. We recommend that you implement the RegressionModel before
+    working on this part of the project.)
+    """
+    def __init__(self):
+        # Initialize your model parameters here
+        "*** YOUR CODE HERE ***"
+        self.input_features = 784
+        self.h1 = 200
+        self.h2 = 100
+        self.output_features = 10
+        self.lr = 0.01
+        self.batch_size = 100
+        self.w1 = nn.Parameter(self.input_features, self.h1)
+        self.b1 = nn.Parameter(1, self.h1)
+        self.w2 = nn.Parameter(self.h1, self.h2)
+        self.b2 = nn.Parameter(1, self.h2)
+        self.w3 = nn.Parameter(self.h2, self.output_features)
+        self.b3 = nn.Parameter(1, self.output_features)
+
+    def run(self, x):
+        """
+        Runs the model for a batch of examples.
+
+        Your model should predict a node with shape (batch_size x 10),
+        containing scores. Higher scores correspond to greater probability of
+        the image belonging to a particular class.
+
+        Inputs:
+            x: a node with shape (batch_size x 784)
+        Output:
+            A node with shape (batch_size x 10) containing predicted scores
+                (also called logits)
+        """
+        "*** YOUR CODE HERE ***"
+        l1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
+        l2 = nn.ReLU(nn.AddBias(nn.Linear(l1, self.w2), self.b2))
+        l3 = nn.AddBias(nn.Linear(l2, self.w3), self.b3)
+        return l3
+
+    def get_loss(self, x, y):
+        """
+        Computes the loss for a batch of examples.
+
+        The correct labels `y` are represented as a node with shape
+        (batch_size x 10). Each row is a one-hot vector encoding the correct
+        digit class (0-9).
+
+        Inputs:
+            x: a node with shape (batch_size x 784)
+            y: a node with shape (batch_size x 10)
+        Returns: a loss node
+        """
+        "*** YOUR CODE HERE ***"
+        return nn.SoftmaxLoss(self.run(x), y)
+
+    def train(self, dataset):
+        """
+        Trains the model.
+        """
+        "*** YOUR CODE HERE ***"
+        while True:
+            for x, y in dataset.iterate_once(self.batch_size):
+                loss = self.get_loss(x, y)
+                g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
+                self.w1.update(g_w1, -self.lr)
+                self.b1.update(g_b1, -self.lr)
+                self.w2.update(g_w2, -self.lr)
+                self.b2.update(g_b2, -self.lr)
+                self.w3.update(g_w3, -self.lr)
+                self.b3.update(g_b3, -self.lr)
+            accuracy = dataset.get_validation_accuracy()
+            print(accuracy)
+            if accuracy > 0.95:
+                break
+
+class LanguageIDModel(object):
+    """
+    A model for language identification at a single-word granularity.
+
+    (See RegressionModel for more information about the APIs of different
+    methods here. We recommend that you implement the RegressionModel before
+    working on this part of the project.)
+    """
+    def __init__(self):
+        # Our dataset contains words from five different languages, and the
+        # combined alphabets of the five languages contain a total of 47 unique
+        # characters.
+        # You can refer to self.num_chars or len(self.languages) in your code
+        self.num_chars = 47
+        self.languages = ["English", "Spanish", "Finnish", "Dutch", "Polish"]
+
+        # Initialize your model parameters here
+        "*** YOUR CODE HERE ***"
+
+    def run(self, xs):
+        """
+        Runs the model for a batch of examples.
+
+        Although words have different lengths, our data processing guarantees
+        that within a single batch, all words will be of the same length (L).
+
+        Here `xs` will be a list of length L. Each element of `xs` will be a
+        node with shape (batch_size x self.num_chars), where every row in the
+        array is a one-hot vector encoding of a character. For example, if we
+        have a batch of 8 three-letter words where the last word is "cat", then
+        xs[1] will be a node that contains a 1 at position (7, 0). Here the
+        index 7 reflects the fact that "cat" is the last word in the batch, and
+        the index 0 reflects the fact that the letter "a" is the initial (0th)
+        letter of our combined alphabet for this task.
+
+        Your model should use a Recurrent Neural Network to summarize the list
+        `xs` into a single node of shape (batch_size x hidden_size), for your
+        choice of hidden_size. It should then calculate a node of shape
+        (batch_size x 5) containing scores, where higher scores correspond to
+        greater probability of the word originating from a particular language.
+
+        Inputs:
+            xs: a list with L elements (one per character), where each element
+                is a node with shape (batch_size x self.num_chars)
+        Returns:
+            A node with shape (batch_size x 5) containing predicted scores
+                (also called logits)
+        """
+        "*** YOUR CODE HERE ***"
+
+    def get_loss(self, xs, y):
+        """
+        Computes the loss for a batch of examples.
+
+        The correct labels `y` are represented as a node with shape
+        (batch_size x 5). Each row is a one-hot vector encoding the correct
+        language.
+
+        Inputs:
+            xs: a list with L elements (one per character), where each element
+                is a node with shape (batch_size x self.num_chars)
+            y: a node with shape (batch_size x 5)
+        Returns: a loss node
+        """
+        "*** YOUR CODE HERE ***"
+
+    def train(self, dataset):
+        """
+        Trains the model.
+        """
+        "*** YOUR CODE HERE ***"
diff --git a/frontend/mnist/nn.py b/frontend/mnist/nn.py
new file mode 100644
index 0000000..86822ed
--- /dev/null
+++ b/frontend/mnist/nn.py
@@ -0,0 +1,393 @@
+import numpy as np
+np.random.seed(42)
+def format_shape(shape):
+    return "x".join(str(dim) for dim in shape) if shape else "()"
+
+class Node(object):
+    def __repr__(self):
+        return "<{} shape={} at {}>".format(
+            type(self).__name__, format_shape(self.data.shape), hex(id(self)))
+
+class DataNode(Node):
+    """
+    DataNode is the parent class for Parameter and Constant nodes.
+
+    You should not need to use this class directly.
+    """
+    def __init__(self, data):
+        self.parents = []
+        self.data = data
+
+    def _forward(self, *inputs):
+        return self.data
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        return []
+
+class Parameter(DataNode):
+    """
+    A Parameter node stores parameters used in a neural network (or perceptron).
+
+    Use the `update` method to update parameters when training the
+    perceptron or neural network.
+    """
+    def __init__(self, *shape):
+        assert len(shape) == 2, (
+            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
+        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
+            "Shape must consist of positive integers, got {!r}".format(shape))
+        limit = np.sqrt(3.0 / np.mean(shape))
+        data = np.random.uniform(low=-limit, high=limit, size=shape)
+        super().__init__(data)
+
+    def update(self, direction, multiplier):
+        assert isinstance(direction, Constant), (
+            "Update direction must be a {} node, instead has type {!r}".format(
+                Constant.__name__, type(direction).__name__))
+        assert direction.data.shape == self.data.shape, (
+            "Update direction shape {} does not match parameter shape "
+            "{}".format(
+                format_shape(direction.data.shape),
+                format_shape(self.data.shape)))
+        assert isinstance(multiplier, (int, float)), (
+            "Multiplier must be a Python scalar, instead has type {!r}".format(
+                type(multiplier).__name__))
+        self.data += multiplier * direction.data
+        assert np.all(np.isfinite(self.data)), (
+            "Parameter contains NaN or infinity after update, cannot continue")
+
+class Constant(DataNode):
+    """
+    A Constant node is used to represent:
+    * Input features
+    * Output labels
+    * Gradients computed by back-propagation
+
+    You should not need to construct any Constant nodes directly; they will
+    instead be provided by either the dataset or when you call `nn.gradients`.
+    """
+    def __init__(self, data):
+        assert isinstance(data, np.ndarray), (
+            "Data should be a numpy array, instead has type {!r}".format(
+                type(data).__name__))
+        assert np.issubdtype(data.dtype, np.floating), (
+            "Data should be a float array, instead has data type {!r}".format(
+                data.dtype))
+        super().__init__(data)
+
+class FunctionNode(Node):
+    """
+    A FunctionNode represents a value that is computed based on other nodes.
+    The FunctionNode class performs necessary book-keeping to compute gradients.
+    """
+    def __init__(self, *parents):
+        assert all(isinstance(parent, Node) for parent in parents), (
+            "Inputs must be node objects, instead got types {!r}".format(
+                tuple(type(parent).__name__ for parent in parents)))
+        self.parents = parents
+        self.data = self._forward(*(parent.data for parent in parents))
+
+class Add(FunctionNode):
+    """
+    Adds matrices element-wise.
+
+    Usage: nn.Add(x, y)
+    Inputs:
+        x: a Node with shape (batch_size x num_features)
+        y: a Node with the same shape as x
+    Output:
+        a Node with shape (batch_size x num_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return inputs[0] + inputs[1]
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient, gradient]
+
+class AddBias(FunctionNode):
+    """
+    Adds a bias vector to each feature vector
+
+    Usage: nn.AddBias(features, bias)
+    Inputs:
+        features: a Node with shape (batch_size x num_features)
+        bias: a Node with shape (1 x num_features)
+    Output:
+        a Node with shape (batch_size x num_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[1].shape[0] == 1, (
+            "First dimension of second input should be 1, instead got shape "
+            "{}".format(format_shape(inputs[1].shape)))
+        assert inputs[0].shape[1] == inputs[1].shape[1], (
+            "Second dimension of inputs should match, instead got shapes {} "
+            "and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return inputs[0] + inputs[1]
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient, np.sum(gradient, axis=0, keepdims=True)]
+
+class DotProduct(FunctionNode):
+    """
+    Batched dot product
+
+    Usage: nn.DotProduct(features, weights)
+    Inputs:
+        features: a Node with shape (batch_size x num_features)
+        weights: a Node with shape (1 x num_features)
+    Output: a Node with shape (batch_size x 1)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[1].shape[0] == 1, (
+            "First dimension of second input should be 1, instead got shape "
+            "{}".format(format_shape(inputs[1].shape)))
+        assert inputs[0].shape[1] == inputs[1].shape[1], (
+            "Second dimension of inputs should match, instead got shapes {} "
+            "and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.dot(inputs[0], inputs[1].T)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        # assert gradient.shape[0] == inputs[0].shape[0]
+        # assert gradient.shape[1] == 1
+        # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
+        raise NotImplementedError(
+            "Backpropagation through DotProduct nodes is not needed in this "
+            "assignment")
+
+class Linear(FunctionNode):
+    """
+    Applies a linear transformation (matrix multiplication) to the input
+
+    Usage: nn.Linear(features, weights)
+    Inputs:
+        features: a Node with shape (batch_size x input_features)
+        weights: a Node with shape (input_features x output_features)
+    Output: a node with shape (batch_size x output_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape[1] == inputs[1].shape[0], (
+            "Second dimension of first input should match first dimension of "
+            "second input, instead got shapes {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.dot(inputs[0], inputs[1])
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape[0] == inputs[0].shape[0]
+        assert gradient.shape[1] == inputs[1].shape[1]
+        return [np.dot(gradient, inputs[1].T), np.dot(inputs[0].T, gradient)]
+
+class ReLU(FunctionNode):
+    """
+    An element-wise Rectified Linear Unit nonlinearity: max(x, 0).
+    This nonlinearity replaces all negative entries in its input with zeros.
+
+    Usage: nn.ReLU(x)
+    Input:
+        x: a Node with shape (batch_size x num_features)
+    Output: a Node with the same shape as x, but no negative entries
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "Input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        return np.maximum(inputs[0], 0)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient * np.where(inputs[0] > 0, 1.0, 0.0)]
+
+class SquareLoss(FunctionNode):
+    """
+    This node first computes 0.5 * (a[i,j] - b[i,j])**2 at all positions (i,j)
+    in the inputs, which creates a (batch_size x dim) matrix. It then calculates
+    and returns the mean of all elements in this matrix.
+
+    Usage: nn.SquareLoss(a, b)
+    Inputs:
+        a: a Node with shape (batch_size x dim)
+        b: a Node with shape (batch_size x dim)
+    Output: a scalar Node (containing a single floating-point number)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.mean(np.square(inputs[0] - inputs[1]) / 2)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert np.asarray(gradient).ndim == 0
+        return [
+            gradient * (inputs[0] - inputs[1]) / inputs[0].size,
+            gradient * (inputs[1] - inputs[0]) / inputs[0].size
+        ]
+
+class SoftmaxLoss(FunctionNode):
+    """
+    A batched softmax loss, used for classification problems.
+
+    IMPORTANT: do not swap the order of the inputs to this node!
+
+    Usage: nn.SoftmaxLoss(logits, labels)
+    Inputs:
+        logits: a Node with shape (batch_size x num_classes). Each row
+            represents the scores associated with that example belonging to a
+            particular class. A score can be an arbitrary real number.
+        labels: a Node with shape (batch_size x num_classes) that encodes the
+            correct labels for the examples. All entries must be non-negative
+            and the sum of values along each row should be 1.
+    Output: a scalar Node (containing a single floating-point number)
+    """
+    @staticmethod
+    def log_softmax(logits):
+        log_probs = logits - np.max(logits, axis=1, keepdims=True)
+        log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True))
+        return log_probs
+
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        assert np.all(inputs[1] >= 0), (
+            "All entries in the labels input must be non-negative")
+        assert np.allclose(np.sum(inputs[1], axis=1), 1), (
+            "Labels input must sum to 1 along each row")
+        log_probs = SoftmaxLoss.log_softmax(inputs[0])
+        return np.mean(-np.sum(inputs[1] * log_probs, axis=1))
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert np.asarray(gradient).ndim == 0
+        log_probs = SoftmaxLoss.log_softmax(inputs[0])
+        return [
+            gradient * (np.exp(log_probs) - inputs[1]) / inputs[0].shape[0],
+            gradient * -log_probs / inputs[0].shape[0]
+        ]
+
+def gradients(loss, parameters):
+    """
+    Computes and returns the gradient of the loss with respect to the provided
+    parameters.
+
+    Usage: nn.gradients(loss, parameters)
+    Inputs:
+        loss: a SquareLoss or SoftmaxLoss node
+        parameters: a list (or iterable) containing Parameter nodes
+    Output: a list of Constant objects, representing the gradient of the loss
+        with respect to each provided parameter.
+    """
+
+    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
+        "Loss must be a loss node, instead has type {!r}".format(
+            type(loss).__name__))
+    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
+        "Parameters must all have type {}, instead got types {!r}".format(
+            Parameter.__name__,
+            tuple(type(parameter).__name__ for parameter in parameters)))
+    assert not hasattr(loss, "used"), (
+        "Loss node has already been used for backpropagation, cannot reuse")
+
+    loss.used = True
+
+    nodes = set()
+    tape = []
+
+    def visit(node):
+        if node not in nodes:
+            for parent in node.parents:
+                visit(parent)
+            nodes.add(node)
+            tape.append(node)
+
+    visit(loss)
+    nodes |= set(parameters)
+
+    grads = {node: np.zeros_like(node.data) for node in nodes}
+    grads[loss] = 1.0
+
+    for node in reversed(tape):
+        parent_grads = node._backward(
+            grads[node], *(parent.data for parent in node.parents))
+        for parent, parent_grad in zip(node.parents, parent_grads):
+            grads[parent] += parent_grad
+
+    return [Constant(grads[parameter]) for parameter in parameters]
+
+def as_scalar(node):
+    """
+    Returns the value of a Node as a standard Python number. This only works
+    for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
+    DotProduct with a batch size of 1 element). The node is left unmodified.
+    """
+
+    assert isinstance(node, Node), (
+        "Input must be a node object, instead has type {!r}".format(
+            type(node).__name__))
+    assert node.data.size == 1, (
+        "Node has shape {}, cannot convert to a scalar".format(
+            format_shape(node.data.shape)))
+    # .item() reads the lone element without reshaping the node's stored data.
+    return node.data.item()
diff --git a/frontend/uct/data/mnist.npz b/frontend/uct/data/mnist.npz
new file mode 100644
index 0000000..abf960a
Binary files /dev/null and b/frontend/uct/data/mnist.npz differ
diff --git a/frontend/uct/dataset.py b/frontend/uct/dataset.py
new file mode 100644
index 0000000..06f9307
--- /dev/null
+++ b/frontend/uct/dataset.py
@@ -0,0 +1,36 @@
+import collections
+import os
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+import uctc.nn as nn
+
+use_graphics = True
+
+def maybe_sleep_and_close(seconds):
+    """If graphics are on and figures are open, sleep `seconds`, then close them."""
+    if use_graphics and plt.get_fignums():
+        time.sleep(seconds)
+        for fignum in plt.get_fignums():
+            fig = plt.figure(fignum)
+            plt.close(fig)
+            try:
+                fig.canvas.start_event_loop(1e-3)
+            except Exception:  # e.g. TclError on some Windows machines; best-effort
+                pass
+
+def get_data_path(filename):
+    """Find `filename` in ../data, ./data, or beside this module, in that order."""
+    here = os.path.dirname(__file__)
+    candidates = (
+        os.path.join(here, os.pardir, "data", filename),
+        os.path.join(here, "data", filename),
+        os.path.join(here, filename),
+    )
+    for path in candidates:
+        if os.path.exists(path):
+            return path
+    raise Exception("Could not find data file: {}".format(filename))
+
diff --git a/frontend/uct/mnist.py b/frontend/uct/mnist.py
new file mode 100644
index 0000000..e8aab63
--- /dev/null
+++ b/frontend/uct/mnist.py
@@ -0,0 +1,232 @@
+import numpy as np
+import time
+import os
+import collections
+
+import matplotlib.pyplot as plt
+import uctc.nn as nn 
+from utils import parameter_data, Dataset
+
+use_graphics = False
+
+class DigitClassificationModel(object):
+    """
+    A model for handwritten digit classification using the MNIST dataset.
+
+    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
+    into a 784-dimensional vector for the purposes of this model. Each entry in
+    the vector is a floating point number between 0 and 1.
+
+    The goal is to sort each digit into one of 10 classes (number 0 through 9).
+
+    (See RegressionModel for more information about the APIs of different
+    methods here. We recommend that you implement the RegressionModel before
+    working on this part of the project.)
+    """
+    def __init__(self):
+        # Initialize your model parameters here
+        "*** YOUR CODE HERE ***"
+        self.input_features = 784
+        self.h1 = 200
+        self.h2 = 100
+        self.output_features = 10
+        self.lr = 0.01
+        self.batch_size = 100
+        self.w1 = nn.Parameter(parameter_data(self.input_features, self.h1))
+        self.b1 = nn.Parameter(parameter_data(1, self.h1))
+        self.w2 = nn.Parameter(parameter_data(self.h1, self.h2))
+        self.b2 = nn.Parameter(parameter_data(1, self.h2))
+        self.w3 = nn.Parameter(parameter_data(self.h2, self.output_features))
+        self.b3 = nn.Parameter(parameter_data(1, self.output_features))
+
+
+    def run(self, x):
+        """
+        Runs the model for a batch of examples.
+
+        Your model should predict a node with shape (batch_size x 10),
+        containing scores. Higher scores correspond to greater probability of
+        the image belonging to a particular class.
+
+        Inputs:
+            x: a node with shape (batch_size x 784)
+        Output:
+            A node with shape (batch_size x 10) containing predicted scores
+                (also called logits)
+        """
+        "*** YOUR CODE HERE ***"
+        l1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
+        l2 = nn.ReLU(nn.AddBias(nn.Linear(l1, self.w2), self.b2))
+        l3 = nn.AddBias(nn.Linear(l2, self.w3), self.b3)
+        return l3
+
+    def get_loss(self, x, y):
+        """
+        Computes the loss for a batch of examples.
+
+        The correct labels `y` are represented as a node with shape
+        (batch_size x 10). Each row is a one-hot vector encoding the correct
+        digit class (0-9).
+
+        Inputs:
+            x: a node with shape (batch_size x 784)
+            y: a node with shape (batch_size x 10)
+        Returns: a loss node
+        """
+        "*** YOUR CODE HERE ***"
+        return nn.SoftmaxLoss(self.run(x), y)
+
+    def train(self, dataset):
+        """
+        Trains the model epoch by epoch until validation accuracy exceeds 0.95 (loops forever otherwise).
+        """
+        "*** YOUR CODE HERE ***"
+        while True:
+            for x, y in dataset.iterate_once(self.batch_size):
+                loss = self.get_loss(x, y)
+                g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2, self.w3, self.b3])
+                self.w1.update(g_w1, self.lr)
+                self.b1.update(g_b1, self.lr)
+                self.w2.update(g_w2, self.lr)
+                self.b2.update(g_b2, self.lr)
+                self.w3.update(g_w3, self.lr)
+                self.b3.update(g_b3, self.lr)
+            accuracy = dataset.get_validation_accuracy()
+            print(accuracy)
+            if accuracy > 0.95:
+                break
+
+def get_data_path(filename):
+    """Return the first existing path for `filename` among ../data, ./data and this file's directory."""
+    here = os.path.dirname(__file__)
+    candidates = (
+        os.path.join(here, os.pardir, "data", filename),
+        os.path.join(here, "data", filename),
+        os.path.join(here, filename))
+    for path in candidates:
+        if os.path.exists(path):
+            return path
+    # FileNotFoundError is a subclass of Exception, so existing `except Exception` handlers still match.
+    raise FileNotFoundError("Could not find data file: {}".format(filename))
+
+class DigitClassificationDataset(Dataset):
+    def __init__(self, model: DigitClassificationModel):
+        mnist_path = get_data_path("mnist.npz")
+
+        with np.load(mnist_path) as data:
+            train_images = data["train_images"]
+            train_labels = data["train_labels"]
+            test_images = data["test_images"]
+            test_labels = data["test_labels"]
+            assert len(train_images) == len(train_labels) == 60000
+            assert len(test_images) == len(test_labels) == 10000
+            self.dev_images = np.array(test_images[0::2], copy=True)
+            self.dev_labels = np.array(test_labels[0::2], copy=True)
+            self.test_images = np.array(test_images[1::2], copy=True)
+            self.test_labels = np.array(test_labels[1::2], copy=True)
+
+        train_labels_one_hot = np.zeros((len(train_images), 10))
+        train_labels_one_hot[range(len(train_images)), train_labels] = 1
+
+        super().__init__(train_images, train_labels_one_hot)
+
+        self.model = model
+        self.epoch = 0
+
+        if use_graphics:
+            width = 20  # Width of each row expressed as a multiple of image width
+            samples = 100  # Number of images to display per label
+            fig = plt.figure()
+            ax = {}
+            images = collections.defaultdict(list)
+            texts = collections.defaultdict(list)
+            for i in reversed(range(10)):
+                ax[i] = plt.subplot2grid((30, 1), (3 * i, 0), 2, 1,
+                                         sharex=ax.get(9))
+                plt.setp(ax[i].get_xticklabels(), visible=i == 9)
+                ax[i].set_yticks([])
+                ax[i].text(-0.03, 0.5, i, transform=ax[i].transAxes,
+                           va="center")
+                ax[i].set_xlim(0, 28 * width)
+                ax[i].set_ylim(0, 28)
+                for j in range(samples):
+                    images[i].append(ax[i].imshow(
+                        np.zeros((28, 28)), vmin=0, vmax=1, cmap="Greens",
+                        alpha=0.3))
+                    texts[i].append(ax[i].text(
+                        0, 0, "", ha="center", va="top", fontsize="smaller"))
+            ax[9].set_xticks(np.linspace(0, 28 * width, 11))
+            ax[9].set_xticklabels(
+                ["{:.1f}".format(num) for num in np.linspace(0, 1, 11)])
+            ax[9].tick_params(axis="x", pad=16)
+            ax[9].set_xlabel("Probability of Correct Label")
+            status = ax[0].text(
+                0.5, 1.5, "", transform=ax[0].transAxes, ha="center",
+                va="bottom")
+            plt.show(block=False)
+
+            self.width = width
+            self.samples = samples
+            self.fig = fig
+            self.images = images
+            self.texts = texts
+            self.status = status
+        self.last_update = time.time()
+
+    def iterate_once(self, batch_size):
+        """Yield training batches; at most once per second also report dev accuracy (and redraw if use_graphics)."""
+        self.epoch += 1
+
+        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
+            yield x, y
+
+            if time.time() - self.last_update > 1:
+                dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
+                dev_argmax = nn.argmax(dev_logits, axis=1)
+                dev_predicted = np.array(dev_argmax.data())
+                # exp(log_softmax(logits)) gives per-class probabilities; .data() is flat,
+                # so restore one row per dev image instead of hard-coding the dev-set size.
+                sftmax = nn.log_softmax(dev_logits)
+                dev_probs = np.array(nn.exp(sftmax).data()).reshape(len(self.dev_images), -1)
+                dev_accuracy = np.mean(dev_predicted == self.dev_labels)
+                # Build the progress message once; it is shared by the console and the figure.
+                status = ("epoch: {:d}, batch: {:d}/{:d}, validation accuracy: "
+                          "{:.2%}".format(
+                              self.epoch, i, len(self.x) // batch_size, dev_accuracy))
+                print(status)
+                if use_graphics:
+                    self.status.set_text(status)
+                    # `label` (not `i`) so the batch index from enumerate() is not clobbered.
+                    for label in range(10):
+                        predicted = dev_predicted[self.dev_labels == label]
+                        probs = dev_probs[self.dev_labels == label][:, label]
+                        linspace = np.linspace(
+                            0, len(probs) - 1, self.samples).astype(int)
+                        indices = probs.argsort()[linspace]
+                        for j, (prob, image) in enumerate(zip(
+                                probs[indices],
+                                self.dev_images[self.dev_labels == label][indices])):
+                            self.images[label][j].set_data(image.reshape((28, 28)))
+                            left = prob * (self.width - 1) * 28
+                            if predicted[indices[j]] == label:
+                                self.images[label][j].set_cmap("Greens")
+                                self.texts[label][j].set_text("")
+                            else:
+                                self.images[label][j].set_cmap("Reds")
+                                self.texts[label][j].set_text(predicted[indices[j]])
+                                self.texts[label][j].set_x(left + 14)
+                            self.images[label][j].set_extent([left, left + 28, 0, 28])
+                    self.fig.canvas.draw_idle()
+                    self.fig.canvas.start_event_loop(1e-3)
+                self.last_update = time.time()
+
+    def get_validation_accuracy(self):
+        """Return the fraction of dev images whose argmax over the model's logits equals the dev label."""
+        dev_logits = self.model.run(nn.Constant(self.dev_images)).tensor()
+        dev_predicted = np.array(nn.argmax(dev_logits, axis=1).data())
+        dev_accuracy = np.mean(dev_predicted == self.dev_labels)
+        return dev_accuracy
+
+model = DigitClassificationModel()
+dataset = DigitClassificationDataset(model)
+model.train(dataset)
\ No newline at end of file
diff --git a/frontend/uct/perception.py b/frontend/uct/perception.py
new file mode 100644
index 0000000..1dcdff1
--- /dev/null
+++ b/frontend/uct/perception.py
@@ -0,0 +1,129 @@
+import numpy as np
+import time
+import os
+
+import matplotlib.pyplot as plt
+import uctc.nn as nn 
+from utils import parameter_data, Dataset
+
+use_graphics = False
+class PerceptronModel(object):
+    def __init__(self, dimensions):
+        """
+        Initialize a new Perceptron instance.
+
+        A perceptron classifies data points as either belonging to a particular
+        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
+        For example, dimensions=2 would mean that the perceptron must classify
+        2D points.
+        """
+        self.w = nn.Parameter(parameter_data(dimensions, 1))
+
+    def get_weights(self):
+        """
+        Return the perceptron's current weight values (the result of `self.w.data()`), not the Parameter itself.
+        """
+        return self.w.data()
+
+    def run(self, x):
+        """
+        Calculates the score assigned by the perceptron to a data point x.
+
+        Inputs:
+            x: a node with shape (1 x dimensions)
+        Returns: a node containing a single number (the score)
+        """
+        "*** YOUR CODE HERE ***"
+        out = nn.Linear(x, self.w)
+        return out
+
+    def get_prediction(self, x):
+        """
+        Calculates the predicted class for a single data point `x`.
+
+        Returns: 1 if the score from run(x) is non-negative, otherwise -1
+        """
+        "*** YOUR CODE HERE ***"
+        score = self.run(x).data()[0]
+        # Element 0 of .data() is taken as the scalar score; a score of exactly 0 is classified as +1.
+        if score >= 0:
+            return 1
+        else:
+            return -1
+
+
+    def train(self, dataset):
+        """
+        Train the perceptron until one full pass over the dataset produces no misclassification.
+        """
+        "*** YOUR CODE HERE ***"
+        batch_size = 1
+
+        while True:
+            converged = True
+            for x, y in dataset.iterate_once(batch_size):
+                prediction = self.get_prediction(x)
+                x = np.array(x.data(), dtype=np.float32)
+                y = int(y.data()[0])
+                # Points are visited one at a time (batch_size = 1); only mistakes change the weights.
+                if prediction != y:
+                    # NOTE(review): multiplier -y presumably makes update() add y * x to w -- confirm in Parameter.update
+                    converged = False
+                    self.w.update(nn.pyarray_to_tensor(x), -y)
+                # `converged` is still True here only if no point so far in this pass was misclassified.
+            if converged:
+                break
+
+class PerceptronDataset(Dataset):
+    def __init__(self, model: PerceptronModel):
+        points = 500
+        x = np.hstack([np.random.randn(points, 2), np.ones((points, 1))])
+        y = np.where(x[:, 0] + 2 * x[:, 1] - 1 >= 0, 1.0, -1.0)
+        super().__init__(x, np.expand_dims(y, axis=1))
+
+        self.model = model
+        self.epoch = 0
+        limits = np.array([-3.0, 3.0])
+        if use_graphics:
+            fig, ax = plt.subplots(1, 1)
+            ax.set_xlim(limits)
+            ax.set_ylim(limits)
+            positive = ax.scatter(*x[y == 1, :-1].T, color="red", marker="+")
+            negative = ax.scatter(*x[y == -1, :-1].T, color="blue", marker="_")
+            line, = ax.plot([], [], color="black")
+            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
+            ax.legend([positive, negative], [1, -1])
+            plt.show(block=False)
+
+            self.fig = fig
+            self.line = line
+            self.text = text
+        self.limits = limits
+        self.last_update = time.time()
+
+    def iterate_once(self, batch_size):
+        """Yield batches from the parent dataset; if >1 ms since the last report, print (and optionally plot) the weights."""
+        self.epoch += 1
+
+        for i, (x, y) in enumerate(super().iterate_once(batch_size)):
+            yield x, y
+
+            if time.time() - self.last_update > 0.001:
+                w, limits = self.model.get_weights(), self.limits
+                status = f"epoch: {self.epoch}\npoint: {i * batch_size + 1}/{len(self.x)}\nweights: {w}"
+                print(status)
+                if use_graphics:
+                    if w[1] != 0:
+                        self.line.set_data(limits, (-w[0] * limits - w[2]) / w[1])
+                    elif w[0] != 0:
+                        self.line.set_data(np.full(2, -w[2] / w[0]), limits)
+                    else:
+                        self.line.set_data([], [])
+                    self.text.set_text(status)  # runs for every branch above, not only the degenerate one
+                    self.fig.canvas.draw_idle()
+                    self.fig.canvas.start_event_loop(1e-3)
+                self.last_update = time.time()
+
+model = PerceptronModel(3)
+dataset = PerceptronDataset(model)
+model.train(dataset)
\ No newline at end of file
diff --git a/frontend/uct/regression.py b/frontend/uct/regression.py
new file mode 100644
index 0000000..a03fec3
--- /dev/null
+++ b/frontend/uct/regression.py
@@ -0,0 +1,141 @@
+import numpy as np
+np.random.seed(42)
+import time
+import os
+
+import matplotlib.pyplot as plt
+import uctc.nn as nn 
+from utils import parameter_data, Dataset
+
+use_graphics = False
+
+class RegressionModel(object):
+    """
+    A neural network model for approximating a function that maps from real
+    numbers to real numbers. The network should be sufficiently large to be able
+    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.
+    """
+    def __init__(self):
+        # Initialize your model parameters here
+        self.batch_size = 10
+        self.input_features = 1
+        self.output_features = 1
+        self.hidden_f1 = 50
+        self.lr = 0.01
+        self.w1 = nn.Parameter(parameter_data(self.input_features, self.hidden_f1))
+        self.b1 = nn.Parameter(parameter_data(1, self.hidden_f1))
+        self.w2 = nn.Parameter(parameter_data(self.hidden_f1, self.output_features))
+        self.b2 = nn.Parameter(parameter_data(1, self.output_features))
+
+    def run(self, x):
+        """
+        Runs the model for a batch of examples.
+
+        The prediction is assembled from uctc graph nodes only, so that
+        nn.gradients() in train() can differentiate the loss with respect to
+        w1, b1, w2 and b2:
+            AddBias(Linear(ReLU(AddBias(Linear(x, w1), b1)), w2), b2)
+
+        Inputs:
+            x: a node with shape (batch_size x 1)
+        Returns:
+            A node with shape (batch_size x 1) containing predicted y-values
+        """
+        "*** YOUR CODE HERE ***"
+
+        # Hidden layer: linear map, bias, then ReLU.
+        linear1 = nn.Linear(x, self.w1)
+        bias1 = nn.AddBias(linear1, self.b1)
+        act1 = nn.ReLU(bias1)
+
+        # Output layer: linear map and bias, no activation.
+        linear2 = nn.Linear(act1, self.w2)
+        bias2 = nn.AddBias(linear2, self.b2)
+
+        # Deliberately no NumPy re-computation of the forward pass here: its
+        # result would be unused, and pulling x and all four parameters out
+        # through .data() on every call is pure overhead during training
+        # (run() is invoked for every batch and for every progress report).
+        return bias2
+
+    def get_loss(self, x, y):
+        """
+        Computes the loss for a batch of examples.
+
+        Inputs:
+            x: a node with shape (batch_size x 1)
+            y: a node with shape (batch_size x 1), containing the true y-values
+                to be used for training
+        Returns: a loss node
+        """
+        "*** YOUR CODE HERE ***"
+        predict_y = self.run(x)
+        return nn.SquareLoss(predict_y, y)
+
+    def train(self, dataset):
+        """
+        Trains the model one epoch at a time until the last batch's loss falls below 0.01.
+        """
+        "*** YOUR CODE HERE ***"
+        while True:
+            for x, y in dataset.iterate_once(self.batch_size):
+                loss = self.get_loss(x, y)
+                g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
+                self.w1.update(g_w1, self.lr)
+                self.b1.update(g_b1, self.lr)
+                self.w2.update(g_w2, self.lr)
+                self.b2.update(g_b2, self.lr)
+            # `loss` is the loss of the final batch of the epoch, computed before
+            # that batch's parameter update; it is the only convergence signal used.
+            if loss.data()[0] < 0.01:
+                break
+            
+    
+class RegressionDataset(Dataset):
+    def __init__(self, model: RegressionModel):
+        x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
+        np.random.RandomState(0).shuffle(x)
+        self.argsort_x = np.argsort(x.flatten())
+        y = np.sin(x)
+        super().__init__(x, y)
+
+        self.model = model
+        self.processed = 0
+
+        if use_graphics:
+            fig, ax = plt.subplots(1, 1)
+            ax.set_xlim(-2 * np.pi, 2 * np.pi)
+            ax.set_ylim(-1.4, 1.4)
+            real, = ax.plot(x[self.argsort_x], y[self.argsort_x], color="blue")
+            learned, = ax.plot([], [], color="red")
+            text = ax.text(0.03, 0.97, "", transform=ax.transAxes, va="top")
+            ax.legend([real, learned], ["real", "learned"])
+            plt.show(block=False)
+
+            self.fig = fig
+            self.learned = learned
+            self.text = text
+        self.last_update = time.time()
+
+    def iterate_once(self, batch_size):
+        for x, y in super().iterate_once(batch_size):
+            yield x, y
+            self.processed += batch_size
+
+            if time.time() - self.last_update > 0.01:
+                predicted = self.model.run(nn.Constant(self.x)).data()
+                loss = self.model.get_loss(
+                    x, y).data()
+                predicted = np.array(predicted)
+                loss = loss[0]
+                print(f"processed: {self.processed}\nloss: {loss: .6f}")
+                if use_graphics:
+                    self.learned.set_data(self.x[self.argsort_x], predicted[self.argsort_x])
+                    self.text.set_text(f"processed: {self.processed}\nloss: {loss: .6f}")
+                    self.fig.canvas.draw_idle()
+                    self.fig.canvas.start_event_loop(1e-3)
+                self.last_update = time.time()
+
+model = RegressionModel()
+dataset = RegressionDataset(model)
+model.train(dataset)
\ No newline at end of file
diff --git a/frontend/uct/test/01_addbias_test.py b/frontend/uct/test/01_addbias_test.py
new file mode 100644
index 0000000..59a69bb
--- /dev/null
+++ b/frontend/uct/test/01_addbias_test.py
@@ -0,0 +1,72 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+
+class LinearTestModel:
+    def __init__(self, output_features):
+        self.b1 = nn.Parameter([1, output_features])
+    
+    def forward(self, x):
+        l2 = nn.AddBias(x, self.b1)
+        return l2
+    
+    def get_loss(self, x, y):
+        return nn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_b1 = nn.gradients(loss, [self.b1])[0]
+        return g_b1.data()
+
+class StdLinerTestModel:
+    def __init__(self, output_features, tmodel: LinearTestModel):
+        self.b1 = stdnn.Parameter(1, output_features)
+        self.b1.data = np.array(tmodel.b1.data()).reshape(1, output_features)
+
+    def forward(self, x):
+        l2 = stdnn.AddBias(x, self.b1)
+        return l2
+    
+    def get_loss(self, x, y):
+        return stdnn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_b1 = stdnn.gradients(loss, [self.b1])[0]
+        return g_b1.data.flatten().tolist()
+
+output_features = 32
+batch_size = 4
+x = np.random.randn(batch_size, output_features).astype(np.float32)
+y = np.random.randn(batch_size, output_features).astype(np.float32)
+
+model = LinearTestModel(output_features)
+test_x = nn.Constant(x)
+predict_y = model.forward(test_x).data()
+test_y = nn.Constant(y)
+loss = model.get_loss(test_x, test_y).data()
+g_b1 = model.backward(test_x, test_y)
+
+stdmodel = StdLinerTestModel(output_features, model)
+std_test_x = stdnn.Constant(x)
+std_predict_y = stdmodel.forward(std_test_x)
+std_test_y = stdnn.Constant(y)
+std_loss = stdmodel.get_loss(std_test_x, std_test_y)
+std_g_b1 = stdmodel.backward(std_test_x, std_test_y)
+
+# check forward: .data() is flat, so flatten the reference too and compare every row, not just row 0
+for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
+    if (abs(got-want) > 1e-4):
+        assert 0, "Forward data mismatch!"
+
+# check loss
+if abs(loss[0] - std_loss.data) > 1e-4:
+    assert 0, "Loss mismatch!"
+
+# check backward
+for i, (x, y) in enumerate(zip(g_b1, std_g_b1)):
+    if (abs(x-y) > 1e-4):
+        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {x} while std g_b1 is {y}"
+
+
+print("Test passed")
\ No newline at end of file
diff --git a/frontend/uct/test/02_linear_test.py b/frontend/uct/test/02_linear_test.py
new file mode 100644
index 0000000..574a04a
--- /dev/null
+++ b/frontend/uct/test/02_linear_test.py
@@ -0,0 +1,81 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+
+class LinearTestModel:
+    def __init__(self, input_features, output_features):
+        self.w1 = nn.Parameter([input_features, output_features])
+        self.b1 = nn.Parameter([1, output_features])
+    
+    def forward(self, x):
+        l1 = nn.Linear(x, self.w1)
+        l2 = nn.AddBias(l1, self.b1)
+        return l2
+    
+    def get_loss(self, x, y):
+        return nn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_w1, g_b1 = nn.gradients(loss, [self.w1, self.b1])
+        return g_w1.data(), g_b1.data()
+
+class StdLinerTestModel:
+    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
+        self.w1 = stdnn.Parameter(input_features, output_features)
+        self.b1 = stdnn.Parameter(1, output_features)
+        self.w1.data = np.array(tmodel.w1.data()).reshape(input_features, output_features)
+        self.b1.data = np.array(tmodel.b1.data()).reshape(1, output_features)
+
+    def forward(self, x):
+        l1 = stdnn.Linear(x, self.w1)
+        l2 = stdnn.AddBias(l1, self.b1)
+        return l2
+    
+    def get_loss(self, x, y):
+        return stdnn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_w1, g_b1 = stdnn.gradients(loss, [self.w1, self.b1])
+        return g_w1.data.flatten().tolist(), g_b1.data.flatten().tolist()
+
+input_features = 16
+output_features = 32
+batch_size = 4
+x = np.random.randn(batch_size, input_features).astype(np.float32)
+y = np.random.randn(batch_size, output_features).astype(np.float32)
+
+model = LinearTestModel(input_features, output_features)
+test_x = nn.Constant(x)
+predict_y = model.forward(test_x).data()
+test_y = nn.Constant(y)
+loss = model.get_loss(test_x, test_y).data()
+g_w1, g_b1 = model.backward(test_x, test_y)
+
+stdmodel = StdLinerTestModel(input_features, output_features, model)
+std_test_x = stdnn.Constant(x)
+std_predict_y = stdmodel.forward(std_test_x)
+std_test_y = stdnn.Constant(y)
+std_loss = stdmodel.get_loss(std_test_x, std_test_y)
+std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)
+
+# check forward: .data() is flat, so flatten the reference too and compare every row, not just row 0
+for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
+    if (abs(got-want) > 1e-4):
+        assert 0, "Forward data mismatch!"
+
+# check loss
+if abs(loss[0] - std_loss.data) > 1e-4:
+    assert 0, "Loss mismatch!"
+
+# check backward
+for i, (x, y) in enumerate(zip(g_w1, std_g_w1)):
+    if (abs(x-y) > 1e-4):
+        assert 0, f"Gradient w1 mismatch at position {i}, g_w1 is {x} while std g_w1 is {y}"
+for i, (x, y) in enumerate(zip(g_b1, std_g_b1)):
+    if (abs(x-y) > 1e-4):
+        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {x} while std g_b1 is {y}"
+
+
+print("Test passed")
\ No newline at end of file
diff --git a/frontend/uct/test/03_relu_test.py b/frontend/uct/test/03_relu_test.py
new file mode 100644
index 0000000..6d47abe
--- /dev/null
+++ b/frontend/uct/test/03_relu_test.py
@@ -0,0 +1,83 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+
+class LinearTestModel:
+    def __init__(self, input_features, output_features):
+        self.w1 = nn.Parameter([input_features, output_features])
+        self.b1 = nn.Parameter([1, output_features])
+    
+    def forward(self, x):
+        l1 = nn.Linear(x, self.w1)
+        l2 = nn.AddBias(l1, self.b1)
+        l3 = nn.ReLU(l2)
+        return l3
+    
+    def get_loss(self, x, y):
+        return nn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_w1, g_b1 = nn.gradients(loss, [self.w1, self.b1])
+        return g_w1.data(), g_b1.data()
+
+class StdLinerTestModel:
+    def __init__(self, input_features, output_features, tmodel: LinearTestModel):
+        self.w1 = stdnn.Parameter(input_features, output_features)
+        self.b1 = stdnn.Parameter(1, output_features)
+        self.w1.data = np.array(tmodel.w1.data()).reshape(input_features, output_features)
+        self.b1.data = np.array(tmodel.b1.data()).reshape(1, output_features)
+
+    def forward(self, x):
+        l1 = stdnn.Linear(x, self.w1)
+        l2 = stdnn.AddBias(l1, self.b1)
+        l3 = stdnn.ReLU(l2)
+        return l3
+    
+    def get_loss(self, x, y):
+        return stdnn.SquareLoss(self.forward(x), y)
+    
+    def backward(self, x, y):
+        loss = self.get_loss(x, y)
+        g_w1, g_b1 = stdnn.gradients(loss, [self.w1, self.b1])
+        return g_w1.data.flatten().tolist(), g_b1.data.flatten().tolist()
+
+input_features = 16
+output_features = 32
+batch_size = 4
+x = np.random.randn(batch_size, input_features).astype(np.float32)
+y = np.random.randn(batch_size, output_features).astype(np.float32)
+
+model = LinearTestModel(input_features, output_features)
+test_x = nn.Constant(x)
+predict_y = model.forward(test_x).data()
+test_y = nn.Constant(y)
+loss = model.get_loss(test_x, test_y).data()
+g_w1, g_b1 = model.backward(test_x, test_y)
+
+stdmodel = StdLinerTestModel(input_features, output_features, model)
+std_test_x = stdnn.Constant(x)
+std_predict_y = stdmodel.forward(std_test_x)
+std_test_y = stdnn.Constant(y)
+std_loss = stdmodel.get_loss(std_test_x, std_test_y)
+std_g_w1, std_g_b1 = stdmodel.backward(std_test_x, std_test_y)
+
+# check forward: .data() is flat, so flatten the reference too and compare every row, not just row 0
+for got, want in zip(predict_y, std_predict_y.data.flatten().tolist()):
+    if (abs(got-want) > 1e-4):
+        assert 0, "Forward data mismatch!"
+
+# check loss
+if abs(loss[0] - std_loss.data) > 1e-4:
+    assert 0, "Loss mismatch!"
+
+# check backward
+for i, (x, y) in enumerate(zip(g_w1, std_g_w1)):
+    if (abs(x-y) > 1e-4):
+        assert 0, f"Gradient w1 mismatch at position {i}, g_w1 is {x} while std g_w1 is {y}"
+for i, (x, y) in enumerate(zip(g_b1, std_g_b1)):
+    if (abs(x-y) > 1e-4):
+        assert 0, f"Gradient b1 mismatch at position {i}, g_b1 is {x} while std g_b1 is {y}"
+
+
+print("Test passed")
\ No newline at end of file
diff --git a/frontend/uct/test/04_2layers_test.py b/frontend/uct/test/04_2layers_test.py
new file mode 100644
index 0000000..19d89a8
--- /dev/null
+++ b/frontend/uct/test/04_2layers_test.py
@@ -0,0 +1,144 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+np.random.seed(42)
+class LinearTestModel:
+    """uctc.nn model under test: Linear -> AddBias -> ReLU -> Linear -> AddBias, scored with SquareLoss."""
+    def __init__(self, input_features, hidden_features, output_features):
+        # Each parameter is constructed from a two-element shape list.
+        self.w1 = nn.Parameter([input_features, hidden_features])
+        self.b1 = nn.Parameter([1, hidden_features])
+        self.w2 = nn.Parameter([hidden_features, output_features])
+        self.b2 = nn.Parameter([1, output_features])
+
+    def forward(self, x):
+        """Return the prediction node built from the input node x."""
+        hidden = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
+        output = nn.AddBias(nn.Linear(hidden, self.w2), self.b2)
+        return output
+
+    def get_loss(self, x, y):
+        """Return the SquareLoss node between forward(x) and the target node y."""
+        return nn.SquareLoss(self.forward(x), y)
+
+    def backward(self, x, y):
+        """Return the data() of the loss gradients for w1, b1, w2 and b2, in that order."""
+        g_w1, g_b1, g_w2, g_b2 = nn.gradients(self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
+        return tuple(grad.data() for grad in (g_w1, g_b1, g_w2, g_b2))
+
+    def update(self, x, y, lr):
+        """Call update(gradient, lr) on every parameter, print the gradients, return the parameters' data()."""
+        params = (self.w1, self.b1, self.w2, self.b2)
+        g_w1, g_b1, g_w2, g_b2 = nn.gradients(self.get_loss(x, y), list(params))
+        grads = (g_w1, g_b1, g_w2, g_b2)
+        for param, grad in zip(params, grads):
+            param.update(grad, lr)
+        # Dump each gradient (w1, b1, w2, b2 order) once all parameters have been updated.
+        for grad in grads:
+            print(grad.data())
+        return tuple(param.data() for param in params)
+
+
+class StdLinerTestModel:
+    """std_model reference of the same two-layer network, starting from tmodel's parameter values."""
+    def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel):
+        def mirrored(source, rows, cols):
+            # New stdnn parameter whose data is replaced by source.data() reshaped to (rows, cols).
+            param = stdnn.Parameter(rows, cols)
+            param.data = np.array(source.data()).reshape(rows, cols)
+            return param
+
+        self.w1 = mirrored(tmodel.w1, input_features, hidden_features)
+        self.b1 = mirrored(tmodel.b1, 1, hidden_features)
+        self.w2 = mirrored(tmodel.w2, hidden_features, output_features)
+        self.b2 = mirrored(tmodel.b2, 1, output_features)
+
+    def forward(self, x):
+        """Return the prediction node: Linear -> AddBias -> ReLU -> Linear -> AddBias."""
+        hidden = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
+        output = stdnn.AddBias(stdnn.Linear(hidden, self.w2), self.b2)
+        return output
+
+    def get_loss(self, x, y):
+        """Return the SquareLoss node between forward(x) and the target node y."""
+        return stdnn.SquareLoss(self.forward(x), y)
+
+    def backward(self, x, y):
+        """Return the four loss gradients (w1, b1, w2, b2) as flattened Python lists."""
+        g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
+        return tuple(grad.data.flatten().tolist() for grad in (g_w1, g_b1, g_w2, g_b2))
+
+    def update(self, x, y, lr):
+        """Call update(gradient, -lr) on every parameter and return the parameters as flattened lists."""
+        params = (self.w1, self.b1, self.w2, self.b2)
+        g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(self.get_loss(x, y), list(params))
+        for param, grad in zip(params, (g_w1, g_b1, g_w2, g_b2)):
+            param.update(grad, -lr)
+        return tuple(param.data.flatten().tolist() for param in params)
+
+input_features = 1
+hidden_features = 50
+output_features = 1
+batch_size = 10
+# Fixed inputs and targets, reshaped to batch_size rows.
+x = np.array([-5.146528720855713, 4.451905250549316, 0.4736069440841675, -0.09472138434648514, 4.8939385414123535, 5.209676265716553, -5.967447280883789, 2.9363629817962646, -5.525413990020752, 3.315248489379883]).reshape(batch_size, -1)
+y = np.array([0.9072322249412537, -0.9662654995918274, 0.45609915256500244, -0.09457980841398239, -0.9835651516914368, -0.8788799047470093, 0.3105180263519287, 0.2037920206785202, 0.6873041391372681, -0.17278438806533813]).reshape(batch_size, -1)
+
+model = LinearTestModel(input_features, hidden_features, output_features)
+stdmodel = StdLinerTestModel(input_features, hidden_features, output_features, model)
+
+# Quantities produced by the uctc implementation (update() is called with lr=0).
+test_x = nn.Constant(x)
+predict_y = model.forward(test_x).data()
+test_y = nn.Constant(y)
+loss = model.get_loss(test_x, test_y).data()
+g_w1, g_b1, g_w2, g_b2 = model.backward(test_x, test_y)
+new_w1, new_b1, new_w2, new_b2 = model.update(test_x, test_y, 0)
+
+# The same quantities from the std_model reference.
+std_test_x = stdnn.Constant(x)
+std_predict_y = stdmodel.forward(std_test_x)
+std_test_y = stdnn.Constant(y)
+std_loss = stdmodel.get_loss(std_test_x, std_test_y)
+std_g_w1, std_g_b1, std_g_w2, std_g_b2 = stdmodel.backward(std_test_x, std_test_y)
+std_new_w1, std_new_b1, std_new_w2, std_new_b2 = stdmodel.update(std_test_x, std_test_y, 0)
+
+
+def _check_close(kind, prefix, name, got_values, want_values):
+    """Compare two flat sequences element by element.
+
+    Fails at the first index whose values differ by more than 1e-4. zip()
+    stops at the shorter sequence, so surplus elements are not compared.
+    kind, prefix and name only shape the failure message.
+    """
+    for i, (got, want) in enumerate(zip(got_values, want_values)):
+        if abs(got - want) > 1e-4:
+            assert 0, f"{kind} {name} mismatch at position {i}, {prefix}_{name} is {got} while std {prefix}_{name} is {want}"
+
+
+# check forward
+forward_pairs = zip(predict_y, std_predict_y.data.flatten().tolist())
+if any(abs(got - want) > 1e-4 for got, want in forward_pairs):
+    assert 0, "Forward data mismatch!"
+
+# check loss
+loss_gap = abs(loss[0] - std_loss.data)
+if loss_gap > 1e-4:
+    assert 0, "Loss mismatch!"
+
+# check backward
+_check_close("Gradient", "g", "w1", g_w1, std_g_w1)
+_check_close("Gradient", "g", "b1", g_b1, std_g_b1)
+_check_close("Gradient", "g", "w2", g_w2, std_g_w2)
+_check_close("Gradient", "g", "b2", g_b2, std_g_b2)
+
+# check update (b1 first, then w1)
+_check_close("Updated", "new", "b1", new_b1, std_new_b1)
+_check_close("Updated", "new", "w1", new_w1, std_new_w1)
+
+# NOTE(review): the w2/b2 update checks are disabled, exactly as in the original;
+# the reason is not recorded -- confirm before enabling them.
+# _check_close("Updated", "new", "b2", new_b2, std_new_b2)
+# _check_close("Updated", "new", "w2", new_w2, std_new_w2)
+
+print("Test passed")
\ No newline at end of file
diff --git a/frontend/uct/test/05_training_test.py b/frontend/uct/test/05_training_test.py
new file mode 100644
index 0000000..5fdb7b0
--- /dev/null
+++ b/frontend/uct/test/05_training_test.py
@@ -0,0 +1,128 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+np.random.seed(42)
+class LinearTestModel:
+    """uctc.nn two-layer network fitted to sin(x) on 200 points in [-2*pi, 2*pi]."""
+    def __init__(self, input_features, hidden_features, output_features):
+        self.w1 = nn.Parameter([input_features, hidden_features])
+        self.b1 = nn.Parameter([1, hidden_features])
+        self.w2 = nn.Parameter([hidden_features, output_features])
+        self.b2 = nn.Parameter([1, output_features])
+
+    def forward(self, x):
+        """Return the prediction node: Linear -> AddBias -> ReLU -> Linear -> AddBias."""
+        layer_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
+        return nn.AddBias(nn.Linear(layer_1, self.w2), self.b2)
+
+    def get_loss(self, x, y):
+        """Return the SquareLoss node between forward(x) and y."""
+        return nn.SquareLoss(self.forward(x), y)
+
+    def backward(self, x, y):
+        """Return the data() of the loss gradients for w1, b1, w2 and b2, in that order."""
+        loss = self.get_loss(x, y)
+        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
+        return g_w1.data(), g_b1.data(), g_w2.data(), g_b2.data()
+
+    def update(self, x, y, lr):
+        """Compute the loss gradients on (x, y) and call update(gradient, lr) on every parameter."""
+        loss = self.get_loss(x, y)
+        g_w1, g_b1, g_w2, g_b2 = nn.gradients(loss, [self.w1, self.b1, self.w2, self.b2])
+        self.w1.update(g_w1, lr)
+        self.b1.update(g_b1, lr)
+        self.w2.update(g_w2, lr)
+        self.b2.update(g_b2, lr)
+
+    def train(self):
+        """Run `epoch` passes of mini-batch updates (module-level `batch_size`, lr 0.01), printing the last batch's loss."""
+        self.x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
+        self.argsort_x = np.argsort(self.x.flatten())
+        self.y = np.sin(self.x)
+        for i in range(epoch):
+            # shuffle() permutes self.x in place, so the targets must be recomputed
+            # afterwards; otherwise each x row is paired with another row's sin value.
+            np.random.RandomState(0).shuffle(self.x)
+            self.y = np.sin(self.x)
+            index = 0
+            while index < self.x.shape[0]:
+                x = self.x[index:index + batch_size]
+                y = self.y[index:index + batch_size]
+                cx = nn.Constant(x)
+                cy = nn.Constant(y)
+                self.update(cx, cy, 0.01)
+                index += batch_size
+            loss = self.get_loss(cx,cy)
+            print(loss.data())
+
+
+class StdLinerTestModel:
+    """std_model counterpart of LinearTestModel for the sin(x) fitting run."""
+    def __init__(self, input_features, hidden_features, output_features, tmodel: LinearTestModel):
+        # NOTE(review): tmodel is accepted but never read. The statements that copied its
+        # parameter values into this model are commented out in the original, so all four
+        # parameters keep whatever stdnn.Parameter initialises them to.
+        self.w1 = stdnn.Parameter(input_features, hidden_features)
+        self.b1 = stdnn.Parameter(1, hidden_features)
+        self.w2 = stdnn.Parameter(hidden_features, output_features)
+        self.b2 = stdnn.Parameter(1, output_features)
+
+    def forward(self, x):
+        """Return the prediction node: Linear -> AddBias -> ReLU -> Linear -> AddBias."""
+        hidden = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
+        return stdnn.AddBias(stdnn.Linear(hidden, self.w2), self.b2)
+
+    def get_loss(self, x, y):
+        """Return the SquareLoss node between forward(x) and y."""
+        prediction = self.forward(x)
+        return stdnn.SquareLoss(prediction, y)
+
+    def backward(self, x, y):
+        """Return the four loss gradients (w1, b1, w2, b2) as flattened Python lists."""
+        g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(self.get_loss(x, y), [self.w1, self.b1, self.w2, self.b2])
+        return tuple(grad.data.flatten().tolist() for grad in (g_w1, g_b1, g_w2, g_b2))
+
+    def update(self, x, y, lr):
+        """Compute the loss gradients on (x, y) and call update(gradient, -lr) on every parameter."""
+        params = (self.w1, self.b1, self.w2, self.b2)
+        g_w1, g_b1, g_w2, g_b2 = stdnn.gradients(self.get_loss(x, y), list(params))
+        for param, grad in zip(params, (g_w1, g_b1, g_w2, g_b2)):
+            param.update(grad, -lr)
+
+    def train(self):
+        """Build the sin(x) data set and run `epoch` passes, printing the loss after each pass.
+
+        Reads the module-level `epoch` and `batch_size`.
+        """
+        self.x = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, num=200), axis=1)
+        self.argsort_x = np.argsort(self.x.flatten())
+        self.y = np.sin(self.x)
+        sample_count = self.x.shape[0]
+        for _ in range(epoch):
+            start = 0
+            while start < sample_count:
+                stop = start + batch_size
+                batch_x = stdnn.Constant(self.x[start:stop])
+                batch_y = stdnn.Constant(self.y[start:stop])
+                self.update(batch_x, batch_y, 0.01)
+                start += batch_size
+                # NOTE(review): this break ends the pass after its first batch, exactly as in
+                # the original; it looks like a debugging leftover -- confirm before removing.
+                break
+            # The reported loss is evaluated on the last batch that was processed.
+            loss = self.get_loss(batch_x, batch_y)
+            print(loss.data)
+
+# Sizes passed to both model constructors.
+input_features, hidden_features, output_features = 1, 50, 1
+# Module-level settings read inside the train() methods.
+batch_size, epoch = 10, 1
+
+dims = (input_features, hidden_features, output_features)
+model = LinearTestModel(*dims)
+smodel = StdLinerTestModel(*dims, model)
+
+# Only the std_model reference is trained; the uctc run stays disabled:
+# model.train()
+
+smodel.train()
\ No newline at end of file
diff --git a/frontend/uct/test/06_mnist_test.py b/frontend/uct/test/06_mnist_test.py
new file mode 100644
index 0000000..de0f268
--- /dev/null
+++ b/frontend/uct/test/06_mnist_test.py
@@ -0,0 +1,144 @@
+import uctc.nn as nn
+import std_model as stdnn
+import numpy as np
+from data6 import x, y
+np.random.seed(42)
+
+def parameter_data(*shape):
+    """Return a float32 array of the given 2-D shape drawn uniformly from [-limit, limit), limit = sqrt(3 / mean(shape))."""
+    rank = len(shape)
+    assert rank == 2, "Shape must have 2 dimensions, instead has {}".format(rank)
+    dims_ok = all(isinstance(dim, int) and dim > 0 for dim in shape)
+    assert dims_ok, "Shape must consist of positive integers, got {!r}".format(shape)
+    limit = np.sqrt(3.0 / np.mean(shape))
+    return np.random.uniform(low=-limit, high=limit, size=shape).astype(np.float32)
+
+
+class MNISTModel:
+    """uctc.nn network for 784-feature inputs: two ReLU hidden layers (200 and 100 wide), 10 outputs, SoftmaxLoss."""
+    def __init__(self):
+        self.input_features, self.h1, self.h2, self.output_features = 784, 200, 100, 10
+        self.lr = 0.01
+        self.batch_size = 100
+        # Initial values stay available as numpy arrays (w*data / b*data) next to the nn parameters.
+        self.w1data = parameter_data(self.input_features, self.h1)
+        self.b1data = parameter_data(1, self.h1)
+        self.w2data = parameter_data(self.h1, self.h2)
+        self.b2data = parameter_data(1, self.h2)
+        self.w3data = parameter_data(self.h2, self.output_features)
+        self.b3data = parameter_data(1, self.output_features)
+        # The uctc parameters are constructed directly from those arrays.
+        self.w1 = nn.Parameter(self.w1data)
+        self.b1 = nn.Parameter(self.b1data)
+        self.w2 = nn.Parameter(self.w2data)
+        self.b2 = nn.Parameter(self.b2data)
+        self.w3 = nn.Parameter(self.w3data)
+        self.b3 = nn.Parameter(self.b3data)
+
+    def run(self, x):
+        """Return the output node: two Linear -> AddBias -> ReLU stages, then Linear -> AddBias."""
+        hidden_1 = nn.ReLU(nn.AddBias(nn.Linear(x, self.w1), self.b1))
+        hidden_2 = nn.ReLU(nn.AddBias(nn.Linear(hidden_1, self.w2), self.b2))
+        output = nn.AddBias(nn.Linear(hidden_2, self.w3), self.b3)
+        return output
+
+    def get_loss(self, x, y):
+        """Return the SoftmaxLoss node between run(x) and y."""
+        return nn.SoftmaxLoss(self.run(x), y)
+
+    def train(self, x, y):
+        """Call update(gradient, lr) on all six parameters; return the gradients' data() in w1, b1, ..., b3 order."""
+        params = (self.w1, self.b1, self.w2, self.b2, self.w3, self.b3)
+        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = nn.gradients(self.get_loss(x, y), list(params))
+        grads = (g_w1, g_b1, g_w2, g_b2, g_w3, g_b3)
+        for param, grad in zip(params, grads):
+            param.update(grad, self.lr)
+        return tuple(grad.data() for grad in grads)
+
+class StdMNISTModel:
+    """std_model reference for MNISTModel, using the same initial parameter arrays."""
+    def __init__(self, model: MNISTModel):
+        self.input_features, self.h1, self.h2, self.output_features = 784, 200, 100, 10
+        self.lr = 0.01
+        self.batch_size = 100
+
+        def adopt(rows, cols, values):
+            # The array is assigned as-is (no copy), so it is shared with `model`.
+            param = stdnn.Parameter(rows, cols)
+            param.data = values
+            return param
+
+        self.w1 = adopt(self.input_features, self.h1, model.w1data)
+        self.b1 = adopt(1, self.h1, model.b1data)
+        self.w2 = adopt(self.h1, self.h2, model.w2data)
+        self.b2 = adopt(1, self.h2, model.b2data)
+        self.w3 = adopt(self.h2, self.output_features, model.w3data)
+        self.b3 = adopt(1, self.output_features, model.b3data)
+
+    def run(self, x):
+        """Return the output node: two Linear -> AddBias -> ReLU stages, then Linear -> AddBias."""
+        hidden_1 = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(x, self.w1), self.b1))
+        hidden_2 = stdnn.ReLU(stdnn.AddBias(stdnn.Linear(hidden_1, self.w2), self.b2))
+        output = stdnn.AddBias(stdnn.Linear(hidden_2, self.w3), self.b3)
+        return output
+
+    def get_loss(self, x, y):
+        """Return the SoftmaxLoss node between run(x) and y."""
+        return stdnn.SoftmaxLoss(self.run(x), y)
+
+    def train(self, x, y):
+        """Update all six parameters once; return the gradients as flattened lists in w1, b1, ..., b3 order."""
+        params = (self.w1, self.b1, self.w2, self.b2, self.w3, self.b3)
+        g_w1, g_b1, g_w2, g_b2, g_w3, g_b3 = stdnn.gradients(self.get_loss(x, y), list(params))
+        grads = (g_w1, g_b1, g_w2, g_b2, g_w3, g_b3)
+        # Each parameter is updated with the negated learning rate as multiplier.
+        for param, grad in zip(params, grads):
+            param.update(grad, -self.lr)
+        return tuple(grad.data.flatten().tolist() for grad in grads)
+
+model = MNISTModel()
+smodel = StdMNISTModel(model)
+
+o1_x = nn.Constant(x)
+o1_y = nn.Constant(y)  # NOTE(review): y is reshaped to 100x10 while x has 2 rows -- align before enabling the loss checks
+o1_out = model.run(o1_x).data()
+# o1_loss = model.get_loss(o1_x, o1_y)
+# o1_gw1, o1_gb1, o1_gw2, o1_gb2, o1_gw3, o1_gb3 = model.train(o1_x, o1_y)
+
+o2_x = stdnn.Constant(x)
+o2_y = stdnn.Constant(y)
+o2_out = smodel.run(o2_x).data
+# o2_loss = smodel.get_loss(o2_x, o2_y)
+# o2_gw1, o2_gb1, o2_gw2, o2_gb2, o2_gw3, o2_gb3 = smodel.train(o2_x, o2_y)
+
+# check forward: compare the flat uctc output with the flattened reference output, so PASSED is only printed when they agree
+for i, (a, b) in enumerate(zip(o1_out, o2_out.flatten().tolist())):
+    if abs(a - b) > 1e-4:
+        assert 0, f"Forward mismatch at position {i}: uctc gives {a} while std gives {b}"
+
+# for i, (a, b) in enumerate(zip(o1_gw1, o2_gw1)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gw1 failed: {i, a, b}")
+#         break
+# for i, (a, b) in enumerate(zip(o1_gb1, o2_gb1)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gb1 failed: {i, a, b}")
+#         break
+# for i, (a, b) in enumerate(zip(o1_gw2, o2_gw2)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gw2 failed: {i, a, b}")
+#         break
+# for i, (a, b) in enumerate(zip(o1_gb2, o2_gb2)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gb2 failed: {i, a, b}")
+#         break
+# for i, (a, b) in enumerate(zip(o1_gw3, o2_gw3)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gw3 failed: {i, a, b}")
+#         break
+# for i, (a, b) in enumerate(zip(o1_gb3, o2_gb3)):
+#     if abs(a - b) > 1e-4:
+#         print(f"gb3 failed: {i, a, b}")
+#         break
+print("PASSED")
\ No newline at end of file
diff --git a/frontend/uct/test/data6.py b/frontend/uct/test/data6.py
new file mode 100644
index 0000000..f144b96
--- /dev/null
+++ b/frontend/uct/test/data6.py
@@ -0,0 +1,4 @@
+import numpy as np
+x = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294117748737335, 0.7254902124404907, 0.6235294342041016, 0.5921568870544434, 0.2352941334247589, 0.1411764770746231, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8705883026123047, 0.9960784912109375, 0.9960784912109375, 0.9960784912109375, 0.9960784912109375, 0.9450981020927429, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.7764706611633301, 0.6666666865348816, 0.2039215862751007, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26274511218070984, 0.44705885648727417, 0.2823529541492462, 0.44705885648727417, 0.6392157077789307, 0.8901961445808411, 0.9960784912109375, 0.8823530077934265, 0.9960784912109375, 0.9960784912109375, 0.9960784912109375, 0.9803922176361084, 0.8980392813682556, 0.9960784912109375, 0.9960784912109375, 0.5490196347236633, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06666667014360428, 0.25882354378700256, 0.05490196496248245, 0.26274511218070984, 0.26274511218070984, 0.26274511218070984, 0.23137256503105164, 0.08235294371843338, 0.9254902601242065, 0.9960784912109375, 0.41568630933761597, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.32549020648002625, 0.9921569228172302, 0.8196079134941101, 0.07058823853731155, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08627451211214066, 0.9137255549430847, 1.0, 0.32549020648002625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5058823823928833, 0.9960784912109375, 0.9333333969116211, 0.1725490242242813, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23137256503105164, 0.9764706492424011, 0.9960784912109375, 0.24313727021217346, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5215686559677124, 0.9960784912109375, 0.7333333492279053, 0.019607843831181526, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03529411926865578, 0.803921639919281, 0.9725490808486938, 0.22745099663734436, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4941176772117615, 0.9960784912109375, 0.7137255072593689, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.29411765933036804, 0.9843137860298157, 0.9411765336990356, 0.22352942824363708, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07450980693101883, 0.8666667342185974, 
0.9960784912109375, 0.6509804129600525, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011764707043766975, 0.7960785031318665, 0.9960784912109375, 0.8588235974311829, 0.13725490868091583, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14901961386203766, 0.9960784912109375, 0.9960784912109375, 0.3019607961177826, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12156863510608673, 0.8784314393997192, 0.9960784912109375, 0.45098042488098145, 0.003921568859368563, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5215686559677124, 0.9960784912109375, 0.9960784912109375, 0.2039215862751007, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2392157018184662, 0.9490196704864502, 0.9960784912109375, 0.9960784912109375, 0.2039215862751007, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4745098352432251, 0.9960784912109375, 0.9960784912109375, 0.8588235974311829, 0.1568627506494522, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4745098352432251, 0.9960784912109375, 0.8117647767066956, 0.07058823853731155, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14901961386203766, 0.9960784912109375, 0.4274510145187378, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.34117648005485535, 0.988235354423523, 0.32156863808631897, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.529411792755127, 0.9450981020927429, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1764705926179886, 0.9568628072738647, 0.5882353186607361, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3294117748737335, 0.9960784912109375, 0.24705883860588074, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7921569347381592, 0.874509871006012, 0.04313725605607033, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.125490203499794, 0.9960784912109375, 0.847058892250061, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.37254902720451355, 0.9960784912109375, 0.7647059559822083, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5490196347236633, 0.9960784912109375, 0.3019607961177826, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.22352942824363708, 0.9294118285179138, 0.803921639919281, 0.0313725508749485, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4862745404243469, 1.0, 0.6470588445663452, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6705882549285889, 0.9960784912109375, 0.3176470696926117, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0941176563501358, 0.9098039865493774, 0.8431373238563538, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4705882668495178, 0.9960784912109375, 0.6235294342041016, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5921568870544434, 0.9960784912109375, 0.5568627715110779, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8941177129745483, 0.9960784912109375, 0.25882354378700256, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2392157018184662, 0.9843137860298157, 0.9960784912109375, 0.25882354378700256, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5529412031173706, 0.9960784912109375, 0.803921639919281, 0.011764707043766975, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03921568766236305, 0.8431373238563538, 0.9960784912109375, 0.4745098352432251, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.019607843831181526, 0.7764706611633301, 0.6901960968971252, 0.03921568766236305, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).reshape(2, 784)
+
+y = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 
0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).reshape(100, 10)
\ No newline at end of file
diff --git a/frontend/uct/test/std_model.py b/frontend/uct/test/std_model.py
new file mode 100644
index 0000000..1f6eded
--- /dev/null
+++ b/frontend/uct/test/std_model.py
@@ -0,0 +1,393 @@
+import numpy as np
+
+def format_shape(shape):
+    return "x".join(map(str, shape)) if shape else "()"
+
+class Node(object):
+    def __repr__(self):
+        return "<{} shape={} at {}>".format(
+            type(self).__name__, format_shape(self.data.shape), hex(id(self)))
+
+class DataNode(Node):
+    """
+    DataNode is the parent class for Parameter and Constant nodes.
+
+    You should not need to use this class directly.
+    """
+    def __init__(self, data):
+        self.parents = []
+        self.data = data
+
+    def _forward(self, *inputs):
+        return self.data
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        return []
+
+class Parameter(DataNode):
+    """
+    A Parameter node stores parameters used in a neural network (or perceptron).
+
+    Use the `update` method to update parameters when training the
+    perceptron or neural network.
+    """
+    def __init__(self, *shape):
+        assert len(shape) == 2, (
+            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
+        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
+            "Shape must consist of positive integers, got {!r}".format(shape))
+        limit = np.sqrt(3.0 / np.mean(shape))
+        data = np.random.uniform(low=-limit, high=limit, size=shape)
+        super().__init__(data)
+
+    def update(self, direction, multiplier):
+        assert isinstance(direction, Constant), (
+            "Update direction must be a {} node, instead has type {!r}".format(
+                Constant.__name__, type(direction).__name__))
+        assert direction.data.shape == self.data.shape, (
+            "Update direction shape {} does not match parameter shape "
+            "{}".format(
+                format_shape(direction.data.shape),
+                format_shape(self.data.shape)))
+        assert isinstance(multiplier, (int, float)), (
+            "Multiplier must be a Python scalar, instead has type {!r}".format(
+                type(multiplier).__name__))
+        self.data += multiplier * direction.data
+        assert np.all(np.isfinite(self.data)), (
+            "Parameter contains NaN or infinity after update, cannot continue")
+
+class Constant(DataNode):
+    """
+    A Constant node is used to represent:
+    * Input features
+    * Output labels
+    * Gradients computed by back-propagation
+
+    You should not need to construct any Constant nodes directly; they will
+    instead be provided by either the dataset or when you call `nn.gradients`.
+    """
+    def __init__(self, data):
+        assert isinstance(data, np.ndarray), (
+            "Data should be a numpy array, instead has type {!r}".format(
+                type(data).__name__))
+        assert np.issubdtype(data.dtype, np.floating), (
+            "Data should be a float array, instead has data type {!r}".format(
+                data.dtype))
+        super().__init__(data)
+
+class FunctionNode(Node):
+    """
+    A FunctionNode represents a value that is computed based on other nodes.
+    The FunctionNode class performs necessary book-keeping to compute gradients.
+    """
+    def __init__(self, *parents):
+        assert all(isinstance(parent, Node) for parent in parents), (
+            "Inputs must be node objects, instead got types {!r}".format(
+                tuple(type(parent).__name__ for parent in parents)))
+        self.parents = parents
+        self.data = self._forward(*(parent.data for parent in parents))
+
+class Add(FunctionNode):
+    """
+    Adds matrices element-wise.
+
+    Usage: nn.Add(x, y)
+    Inputs:
+        x: a Node with shape (batch_size x num_features)
+        y: a Node with the same shape as x
+    Output:
+        a Node with shape (batch_size x num_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return inputs[0] + inputs[1]
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient, gradient]
+
+class AddBias(FunctionNode):
+    """
+    Adds a bias vector to each feature vector
+
+    Usage: nn.AddBias(features, bias)
+    Inputs:
+        features: a Node with shape (batch_size x num_features)
+        bias: a Node with shape (1 x num_features)
+    Output:
+        a Node with shape (batch_size x num_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[1].shape[0] == 1, (
+            "First dimension of second input should be 1, instead got shape "
+            "{}".format(format_shape(inputs[1].shape)))
+        assert inputs[0].shape[1] == inputs[1].shape[1], (
+            "Second dimension of inputs should match, instead got shapes {} "
+            "and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return inputs[0] + inputs[1]
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient, np.sum(gradient, axis=0, keepdims=True)]
+
+class DotProduct(FunctionNode):
+    """
+    Batched dot product
+
+    Usage: nn.DotProduct(features, weights)
+    Inputs:
+        features: a Node with shape (batch_size x num_features)
+        weights: a Node with shape (1 x num_features)
+    Output: a Node with shape (batch_size x 1)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[1].shape[0] == 1, (
+            "First dimension of second input should be 1, instead got shape "
+            "{}".format(format_shape(inputs[1].shape)))
+        assert inputs[0].shape[1] == inputs[1].shape[1], (
+            "Second dimension of inputs should match, instead got shapes {} "
+            "and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.dot(inputs[0], inputs[1].T)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        # assert gradient.shape[0] == inputs[0].shape[0]
+        # assert gradient.shape[1] == 1
+        # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
+        raise NotImplementedError(
+            "Backpropagation through DotProduct nodes is not needed in this "
+            "assignment")
+
+class Linear(FunctionNode):
+    """
+    Applies a linear transformation (matrix multiplication) to the input
+
+    Usage: nn.Linear(features, weights)
+    Inputs:
+        features: a Node with shape (batch_size x input_features)
+        weights: a Node with shape (input_features x output_features)
+    Output: a node with shape (batch_size x output_features)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape[1] == inputs[1].shape[0], (
+            "Second dimension of first input should match first dimension of "
+            "second input, instead got shapes {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.dot(inputs[0], inputs[1])
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape[0] == inputs[0].shape[0]
+        assert gradient.shape[1] == inputs[1].shape[1]
+        return [np.dot(gradient, inputs[1].T), np.dot(inputs[0].T, gradient)]
+
+class ReLU(FunctionNode):
+    """
+    An element-wise Rectified Linear Unit nonlinearity: max(x, 0).
+    This nonlinearity replaces all negative entries in its input with zeros.
+
+    Usage: nn.ReLU(x)
+    Input:
+        x: a Node with shape (batch_size x num_features)
+    Output: a Node with the same shape as x, but no negative entries
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "Input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        return np.maximum(inputs[0], 0)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert gradient.shape == inputs[0].shape
+        return [gradient * np.where(inputs[0] > 0, 1.0, 0.0)]
+
+class SquareLoss(FunctionNode):
+    """
+    This node first computes 0.5 * (a[i,j] - b[i,j])**2 at all positions (i,j)
+    in the inputs, which creates a (batch_size x dim) matrix. It then calculates
+    and returns the mean of all elements in this matrix.
+
+    Usage: nn.SquareLoss(a, b)
+    Inputs:
+        a: a Node with shape (batch_size x dim)
+        b: a Node with shape (batch_size x dim)
+    Output: a scalar Node (containing a single floating-point number)
+    """
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        return np.mean(np.square(inputs[0] - inputs[1]) / 2)
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert np.asarray(gradient).ndim == 0
+        return [
+            gradient * (inputs[0] - inputs[1]) / inputs[0].size,
+            gradient * (inputs[1] - inputs[0]) / inputs[0].size
+        ]
+
+class SoftmaxLoss(FunctionNode):
+    """
+    A batched softmax loss, used for classification problems.
+
+    IMPORTANT: do not swap the order of the inputs to this node!
+
+    Usage: nn.SoftmaxLoss(logits, labels)
+    Inputs:
+        logits: a Node with shape (batch_size x num_classes). Each row
+            represents the scores associated with that example belonging to a
+            particular class. A score can be an arbitrary real number.
+        labels: a Node with shape (batch_size x num_classes) that encodes the
+            correct labels for the examples. All entries must be non-negative
+            and the sum of values along each row should be 1.
+    Output: a scalar Node (containing a single floating-point number)
+    """
+    @staticmethod
+    def log_softmax(logits):
+        log_probs = logits - np.max(logits, axis=1, keepdims=True)
+        log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True))
+        return log_probs
+
+    @staticmethod
+    def _forward(*inputs):
+        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
+        assert inputs[0].ndim == 2, (
+            "First input should have 2 dimensions, instead has {}".format(
+                inputs[0].ndim))
+        assert inputs[1].ndim == 2, (
+            "Second input should have 2 dimensions, instead has {}".format(
+                inputs[1].ndim))
+        assert inputs[0].shape == inputs[1].shape, (
+            "Input shapes should match, instead got {} and {}".format(
+                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
+        assert np.all(inputs[1] >= 0), (
+            "All entries in the labels input must be non-negative")
+        assert np.allclose(np.sum(inputs[1], axis=1), 1), (
+            "Labels input must sum to 1 along each row")
+        log_probs = SoftmaxLoss.log_softmax(inputs[0])
+        return np.mean(-np.sum(inputs[1] * log_probs, axis=1))
+
+    @staticmethod
+    def _backward(gradient, *inputs):
+        assert np.asarray(gradient).ndim == 0
+        log_probs = SoftmaxLoss.log_softmax(inputs[0])
+        return [
+            gradient * (np.exp(log_probs) - inputs[1]) / inputs[0].shape[0],
+            gradient * -log_probs / inputs[0].shape[0]
+        ]
+
+def gradients(loss, parameters):
+    """
+    Computes and returns the gradient of the loss with respect to the provided
+    parameters.
+
+    Usage: nn.gradients(loss, parameters)
+    Inputs:
+        loss: a SquareLoss or SoftmaxLoss node
+        parameters: a list (or iterable) containing Parameter nodes
+    Output: a list of Constant objects, representing the gradient of the loss
+        with respect to each provided parameter.
+    """
+
+    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
+        "Loss must be a loss node, instead has type {!r}".format(
+            type(loss).__name__))
+    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
+        "Parameters must all have type {}, instead got types {!r}".format(
+            Parameter.__name__,
+            tuple(type(parameter).__name__ for parameter in parameters)))
+    assert not hasattr(loss, "used"), (
+        "Loss node has already been used for backpropagation, cannot reuse")
+
+    loss.used = True
+
+    nodes = set()
+    tape = []
+
+    def visit(node):
+        if node not in nodes:
+            for parent in node.parents:
+                visit(parent)
+            nodes.add(node)
+            tape.append(node)
+
+    visit(loss)
+    nodes |= set(parameters)
+
+    grads = {node: np.zeros_like(node.data) for node in nodes}
+    grads[loss] = 1.0
+
+    for node in reversed(tape):
+        parent_grads = node._backward(
+            grads[node], *(parent.data for parent in node.parents))
+        for parent, parent_grad in zip(node.parents, parent_grads):
+            grads[parent] += parent_grad
+
+    return [Constant(grads[parameter]) for parameter in parameters]
+
+def as_scalar(node):
+    """
+    Returns the value of a Node as a standard Python number. This only works
+    for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
+    DotProduct with a batch size of 1 element).
+    """
+
+    assert isinstance(node, Node), (
+        "Input must be a node object, instead has type {!r}".format(
+            type(node).__name__))
+    assert node.data.size == 1, (
+        "Node has shape {}, cannot convert to a scalar".format(
+            format_shape(node.data.shape)))
+    node.data = node.data.flatten()
+    return node.data.tolist()[0]
diff --git a/frontend/uct/transformer.py b/frontend/uct/transformer.py
new file mode 100644
index 0000000..e69de29
diff --git a/frontend/uct/utils.py b/frontend/uct/utils.py
new file mode 100644
index 0000000..380885f
--- /dev/null
+++ b/frontend/uct/utils.py
@@ -0,0 +1,45 @@
+import numpy as np
+import uctc.nn as nn
+np.random.seed(42)
+def parameter_data(*shape):
+    assert len(shape) == 2, (
+            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
+    assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
+            "Shape must consist of positive integers, got {!r}".format(shape))
+    limit = np.sqrt(3.0 / np.mean(shape))
+    data = np.random.uniform(low=-limit, high=limit, size=shape).astype(np.float32)
+    return data
+
+class Dataset(object):
+    def __init__(self, x, y):
+        assert isinstance(x, np.ndarray)
+        assert isinstance(y, np.ndarray)
+        assert np.issubdtype(x.dtype, np.floating)
+        assert np.issubdtype(y.dtype, np.floating)
+        assert x.ndim == 2
+        assert y.ndim == 2
+        assert x.shape[0] == y.shape[0]
+        self.x = x
+        self.y = y
+
+    def iterate_once(self, batch_size):
+        assert isinstance(batch_size, int) and batch_size > 0, (
+            f"Batch size should be a positive integer, got {batch_size}")
+        assert self.x.shape[0] % batch_size == 0, (
+            f"Dataset size {self.x.shape[0]} is not divisible by batch size {batch_size}")
+        index = 0
+        while index < self.x.shape[0]:
+            x = self.x[index:index + batch_size]
+            y = self.y[index:index + batch_size]
+            yield nn.Constant(x), nn.Constant(y)
+            index += batch_size
+
+    def iterate_forever(self, batch_size):
+        while True:
+            yield from self.iterate_once(batch_size)
+
+    def get_validation_accuracy(self):
+        raise NotImplementedError(
+            "No validation data is available for this dataset. "
+            "In this assignment, only the Digit Classification and Language "
+            "Identification datasets have validation data.")
\ No newline at end of file
diff --git a/lab-guide/00-intro.md b/lab-guide/00-intro.md
new file mode 100644
index 0000000..ffe0fc8
--- /dev/null
+++ b/lab-guide/00-intro.md
@@ -0,0 +1,36 @@
+### Welcome to uct lab
+
+> uct 是Undergraduate Computing Torch的简写。
+
+欢迎你选择uct作为自己的大实验，在这个大实验中，我们将亲自动手使用C++搭建一个机器学习框架，并完成手写体数据集MNIST的识别。
+
+注意：你不需要获得任何对于神经网络的前置知识，考虑到《大学计算（下）》面向的是本科一年级学生，我们设计了非常详细的实验指导书帮助你完成这个实验。
+
+#### 安装构建工具
+
+大型的C++项目显然不止是几个文件，而是成百上千个文件，因此我们需要一个工具来管理这些文件。有很多课程会使用到类似的工具（在《操作系统》课程上，你将会遇见Makefile；在《编译原理》、《并行编译与优化》上，你将会用到CMake），在这里我们选择CMake。
+
+> CMake 是一个开源的跨平台构建系统生成工具，广泛用于管理软件构建过程。它通过生成标准的构建文件（如 Makefile、Visual Studio 项目文件等）来简化跨平台项目的构建流程。
+
+> 对于经验丰富的同学，如果你喜欢使用别的构建工具（例如Bazel）也是可以的~
+
+假如你也正在使用WSL(2)，运行下面的命令可以安装好所需要的工具和库
+
+```bash
+sudo apt update
+sudo apt install -y build-essential cmake git gcc g++
+```
+
+#### 准备Python环境
+
+首先，你需要在Linux下具备Python环境。相信在《大学计算（上）》中，你已经具备这样的技能。我们以使用WSL+VSCode为例介绍环境配置的具体方案。
+
+在VSCode中连接WSL，打开对应目录。
+
+使用`conda`创建一个环境（或使用已有环境），然后执行
+
+```
+pip install pybind11
+```
+
+而后，通过`pip show pybind11`可以找到`pybind11`的安装路径，将对应的头文件路径添加到`.vscode/c_cpp_properties.json`的`includePath`中。
\ No newline at end of file
diff --git a/lab-guide/01-fundamentals.md b/lab-guide/01-fundamentals.md
new file mode 100644
index 0000000..9b7edbe
--- /dev/null
+++ b/lab-guide/01-fundamentals.md
@@ -0,0 +1,117 @@
+### 第一部分：基本操作
+
+#### 基本函数的构建
+
+在这一部分中，我们将完成基本的四则运算和由它们组合而成的初等函数的构建。你需要在cc/operators中补全`ops.h`和`ops.cc`的内容。
+
+**[TASK 1]** 在`ops.h`中，你需要补全以下函数的实现：
+
+- `mul`函数，输入为两个数`a`、`b`，输出为它们的乘积。
+
+- `id`函数，将输入原样输出。
+
+- `add`函数，输入为两个数`a`、`b`，输出为它们的和。
+
+- `neg`函数，输入为`a`，输出为`-a`。
+
+- `lt`函数，输入为两个数`a`、`b`，输出为`(float)(a < b)`。
+
+- `eq`函数，输入为两个数`a`、`b`，输出为`(float)(a == b)`。
+
+- `max`函数，输入为两个数`a`、`b`，输出为`a`和`b`中较大的那个。
+
+它们都是模板函数，相信你已经注意到了，它们都被定义在`.h`文件中，而不是`.cc`文件中，这与C++的模板的实例化机制和编译模型有关。
+
+模板的实例化机制：模板函数或模板类并不是真正的代码，而是一个“蓝图”或“模式”，编译器在编译时根据这个蓝图生成具体的代码。这个过程称为模板实例化。例如，当你使用一个模板函数时，编译器会根据你传递的类型参数生成一个具体的函数版本。这个生成的过程发生在编译时。
+
+编译模型：C++采用的是分离编译模型，即每个源文件（.cc 或 .cpp 文件）是独立编译的。编译器在编译一个源文件时，只会看到该源文件及其包含的头文件中的内容。如果你将模板函数的定义放在源文件中，其他源文件在编译时无法看到模板的定义，因此无法生成对应的实例化代码。
+
+另外，你应当还注意到了我们为这两个文件提供了名叫`operators`的命名空间（namespace）。这样做主要是为了避免其中的名字与其他代码（例如标准库）中的同名标识符发生冲突。
+
+**[TASK 2]** 在`ops.cc`中，你需要完成以下函数的实现：
+
+- `is_close`函数，输入为两个数`x`、`y`，输出为`(float)(abs(x - y) < epsilon)`。
+
+- `sigmoid`函数，输入为`x`，为了方便计算，在输出时遵照下面的规则：
+
+$$
+f(x) =\left\{\begin{matrix}
+\frac{1.0}{(1.0 + e^{-x})}, x\ge 0
+ \\
+\frac{e^x}{(1.0 + e^{x})}, \mathrm{otherwise}
+\end{matrix}\right.
+$$
+
+- `relu`函数，输入为`x`，输出为`x > 0.0 ? x : 0.0`。
+
+- `inv`函数，输入为`x`，输出为`1.0 / x`。
+
+- `inv_back`函数，用于计算$f(x)=\frac{1}{x}$的微分$f'(x)\mathrm{d}x$，输入为`x`和`d`，输出为$-\frac{d}{x^2}$。
+
+- `relu_back`函数，输入为`x`和`d`，输出为`x > 0.0 ? d*1.0 : 0.0`。
+
+#### 函数式编程基础
+
+实现`map`、`zipWith`和`reduce`。
+
+`map`接受一个`std::vector`和一个函数作为输入，返回一个新的`std::vector`，其中每个元素都是输入函数应用于输入`std::vector`中对应元素的结果。具体来说，对于下面这个实现：
+
+```cpp
+template<typename T, typename F>
+auto map(const std::vector<T>& vec, F func) -> std::vector<decltype(func(std::declval<T>()))> {
+
+    std::vector<decltype(func(std::declval<T>()))> result;
+    result.reserve(vec.size());
+
+    std::transform(vec.begin(), vec.end(), std::back_inserter(result), func);
+
+    return result;
+}
+```
+
+有几处可能让你感到疑惑的地方。
+
+首先，这里的函数返回类型居然和Python的类型标注一样被后置了！`->` 是 C++11 引入的尾置返回类型语法。它的作用是将函数的返回类型放在函数参数列表之后，而不是放在函数名之前。在某些情况下，返回类型可能依赖于函数参数或模板参数，而这些信息在函数名之前是不可用的。尾置返回类型允许我们在函数参数列表之后推导返回类型。
+
+> 例如，在`map`函数中，返回类型依赖于`func`的返回类型，而`func`的类型在函数名之前是未知的。使用尾置返回类型可以解决这个问题。
+
+其次，我们使用了`std::declval`。`std::declval`是 C++11 引入的一个工具，用于在编译时模拟一个对象的“假实例”，以便在不实际构造对象的情况下推导类型。
+
+```cpp
+decltype(func(std::declval<T>()))
+```
+
+> 在`map`函数中，我们需要推导`func`的返回类型。假设`func`是一个函数对象，接受`T`类型的参数并返回某种类型`R`，我们可以使用`std::declval`来模拟调用`func`的过程。
+
+**[TASK 3]** 在`ops.cc`中，调用我们给出的`map`函数实现和你刚刚完成的`neg`函数，补全`negList`函数（大约需要1行代码）。
+
+**[TASK 4]** 在`ops.h`中，仿照`map`函数，补全`zipWith`函数（大约需要10行代码）。`zipWith`函数接受两个`vector`和一个函数`func`作为输入，要得到一个新的`vector`，这个`vector`中的元素都是两个`vector`逐元素进行函数`func`操作之后的结果。例如，对于`vec1 = [1, 2, 3]`，`vec2 = [5, 6, 7]`，`func`为`add`，那么将返回`[6, 8, 10]`。注意：在进行`zipWith`函数的实现时，你需要考虑输入的两个`std::vector`长度不一致的情况，对于这种情况，你简单地`throw`一个异常即可。
+
+**[TASK 5]** 在`ops.cc`中，使用你实现的`zipWith`和`add`函数，实现`addLists`函数（大约需要1行代码）。
+
+**[TASK 6]** 实际上你会发现`std::accumulate`（问一问LLM这个是个啥）就能够承担`reduce`函数的功能，因此你可以直接使用`std::accumulate`来实现`reduce`函数。这个任务需要你使用`reduce`函数实现`sumList`（将一个列表中的元素相加）和`prodList`（将一个列表中的元素相乘）函数（大约分别需要1行代码）。
+
+#### 检查结果
+
+做完了？很好，切换到`cc`，执行下面的语句来编译框架
+
+```
+cmake -S . -B build
+cd build
+make
+```
+现在，编辑系统环境变量
+
+```
+echo 'export PYTHONPATH="??????"' >> ~/.bashrc
+```
+
+将??????替换为刚刚生成的`build`文件夹的绝对路径（直接粘贴到引号内即可），这个路径应该形如
+
+```Python
+/home/hexu/learn/uc-modern-cpp-student/cc/build
+```
+
+> 可以切换到`build`目录下，执行`pwd`命令来获取绝对路径。
+
+好了，不出意外的话，就再也别动`~/.bashrc`了。现在还有一个`frontend/framework/basis/test_task1.py`文件。切换到目录`frontend/framework/basis/`，直接运行task1到task6的文件，如果没有任何报错，说明你已经完成了这一关！🎉
\ No newline at end of file
diff --git a/lab-guide/02-autodiff.md b/lab-guide/02-autodiff.md
new file mode 100644
index 0000000..eb6a4a1
--- /dev/null
+++ b/lab-guide/02-autodiff.md
@@ -0,0 +1,56 @@
+### 第二部分：自动微分
+
+#### 数值微分
+
+有时候，我们无需知道一个函数具体的表达式，借助导数的定义，利用计算机可以求解出在某一点的导数值。这种方法称为数值微分。举个例子，对于任何一个$f(x)$，我们当然可以根据定义求出其在$x=x_0$处的导数，即
+
+$$f'(x)|_{x=x_0} \approx \frac{f(x_0+\varepsilon)-f(x_0 - \varepsilon)}{2\varepsilon }$$
+
+其中$\varepsilon$是一个很小的正数。但是，如果$f(x)$的表达式非常复杂，那么我们可能无法直接求出导数。此时，我们可以借助数值微分来求解导数值。下面我们以$f(x)=x^2$为例，演示如何使用数值微分求解导数值。
+
+```python
+import numpy as np
+
+def f(x):
+    return x**2
+
+def numerical_diff(f, x):
+    h = 1e-4
+    return (f(x+h) - f(x-h)) / (2*h)
+
+print(numerical_diff(f, 5.0))  # 约为 10.0，与解析结果 f'(5) = 2 * 5 一致
+```
+
+当然，你现在需要用C++来完成这件事。
+
+**[TASK 7]** 补全`operators/autodiff.h`中的`central_difference`函数，实现数值微分，求出$f(x_1, x_2, \ldots, x_n)$对第$arg$个参数的偏导数值（其余参数保持不变）。
+
+
+#### 高等数学中的导数
+
+还记得$z = x + y$，对$x$和$y$分别求导的结果是什么吗？显然，根据多元函数的求导法则，有$\frac{\partial z}{\partial x}=1$，以及$\frac{\partial z}{\partial y}=1$。如果我们再考虑梯度，那么$z$的梯度就是$\nabla z = (1, 1)$。那么，对于更复杂的函数，比如$f(x, y) = x^2 + y^2$，其梯度$\nabla f$又是什么呢？
+
+**[TASK 8]** 补全`operators/autodiff.h`中的`Add`类，能够对表达式$z = x + y$求导。
+
+提示：补全`forward`和`backward`函数，分别实现前向传播和反向传播。前向传播：得到`a + b`的值；反向传播：得到`a`和`b`的梯度（也就是结果分别对`a`、`b`的偏导数，再乘上从后面传回来的梯度`d_input`）。
+
+**[TASK 9]** 仿照`Add`类构造`operators/autodiff.h`中的`Mul`类，能够对表达式$z = x \cdot y$求导。
+
+**[TASK 10]** 仿照`Add`类构造`operators/autodiff.h`中的`Log`类，能够对表达式$z = log(x)$求导。提示：使用`<cmath>`提供的`logf`函数。
+
+**[TASK 11]** 仿照`Add`类构造`operators/autodiff.h`中的`Inv`类，能够对表达式$z = 1 / x$求导。
+
+**[TASK 12]** 仿照`Add`类构造`operators/autodiff.h`中的`Sigmoid`类，能够对表达式$z = sigmoid(x)$求导。提示：使用`<cmath>`提供的`expf`函数。
+
+#### 检查结果
+
+做完了？很好，切换到`cc`，执行下面的语句来编译框架
+
+```
+cmake -S . -B build
+cd build
+make
+```
+如果你已经完成了01，那么环境变量应该是好的。否则，请回到01的实验手册，查看如何修改环境变量。
+
+现在还有一个`frontend/framework/autodiff/test_task7.py`文件。切换到目录`frontend/framework/autodiff/`，直接运行相应的task文件，如果没有任何报错，说明你已经完成了这一关！🎉
\ No newline at end of file
diff --git a/lab-guide/03-framework.md b/lab-guide/03-framework.md
new file mode 100644
index 0000000..405a26b
--- /dev/null
+++ b/lab-guide/03-framework.md
@@ -0,0 +1,125 @@
+### 第三部分：进入人工智能的世界
+
+> 前两关是不是很简单？
+
+相信你在前两部分中，已经积累了足够多的C++知识，也回忆起了足够多的高等数学知识。现在，我们要构造一个框架，这个框架可以接受一个矩阵作为输入，并且支持神经网络中的常见的网络层，例如
+
+- 线性层（Linear）
+- 激活层（Activation）
+- 损失层（Loss）
+
+#### 张量类
+
+我们已经在`cc/tensor/tensor.h`中定义了张量类，这个类可以表示一个多维数组，并且支持常见的数学运算。我们可以在`cc/tensor/tensor.cc`中实现这些运算。当然，我们假定所有的张量都是二维的，这样你就不必考虑各种情况。
+
+**[TASK 13]** 补全`cc/tensor/tensor.cc`中关于`Tensor::transpose()`的函数实现。它能够将一个张量进行转置。
+
+**[TASK 14]** 补全`cc/tensor/tensor.cc`中关于`argmax(const std::shared_ptr<Tensor>& tensor, int axis)`的函数实现，它能够返回一个张量在指定维度上的最大值的索引。提示：你可以使用`std::numeric_limits<float>::infinity()`，可以通过LLM来查询它的含义。
+
+> 前面做了这么多次测试，你是不是该自己学会写测试了？...算了，还是我来帮你写吧...😂
+
+测试文件：`frontend/framework/tensor/task13_14.py`
+
+**关于测试用例** 之后的内容的测试用例可以参考`frontend/uct/test`下的文件，或依据自己的需要编写。
+
+#### 线性层
+
+线性层是神经网络中最为常见的网络层，它接受一个输入张量，并且输出一个张量。输入两个张量`feature: (batch_size x input_features)`和`weight: (input_features x output_features)`，输出张量`output: (batch_size x output_features)`，实际上就是将`feature`矩阵和`weight`矩阵相乘。
+
+用公式表示就是$y = xW$（其中$x$为`feature`，$W$为`weight`；偏置项$b$不在线性层中实现，将在后面的`AddBias`中单独实现）。
+
+**[TASK 15]** 补全`cc/operators/nn.h`中`Linear`类的构造函数和`forward`函数。
+
+- 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示输入特征和权重。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。
+
+- `forward`函数：参见有关线性层的介绍。
+
+
+**[TASK 16]** 补全`cc/operators/nn.cc`中`Linear`类的`backward`函数。
+
+- `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grad_features`和`grad_weights`，它们分别表示对`features`和`weights`的梯度。
+
+> 数学Tips：`grad_features`是通过将`gradient`与`weights`的转置相乘得到的。`grad_weights`是通过将`features`的转置与`gradient`相乘得到的。
+
+完成了这两个任务后，你应该可以在`cc/`下执行
+
+```
+cmake -S . -B build
+cmake --build build
+```
+
+就能够编译你的代码。然后，你应当可以运行`frontend/uct/perception.py`，它将使用你实现的线性层来训练一个感知机。
+
+#### 激活层
+
+激活层是神经网络中常见的网络层，它接受一个输入张量，并且输出一个张量。输入一个张量`x`，输出一个张量`y`，实际上就是将`x`中的每个元素进行某种变换。
+
+用公式表示就是$y = f(x)$。对于`ReLU`函数来说，$y = max(0, x)$。
+
+**[TASK 17]** 补全`cc/operators/nn.h`中`ReLU`类的构造函数和`forward`函数。
+
+- 构造函数：构造函数接受一个参数`a`，它是一个`std::shared_ptr<Node>`类型的智能指针，表示输入特征。构造函数调用基类`FunctionNode`的构造函数，并将`a`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。
+
+- `forward`函数：参见有关激活层的介绍。
+
+**[TASK 18]** 补全`cc/operators/nn.cc`中`ReLU`类的`backward`函数。
+
+- `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grads`，它表示对`features`的梯度。
+
+> 数学Tips：`grads`是通过将`gradient`与一个掩码逐元素相乘得到的：`x`中大于0的位置掩码为1，其余位置为0。也就是说，`x > 0`处的梯度原样保留，其余位置的梯度为0。
+
+#### 偏置
+
+线性层中，我们没有实现偏置项`b`，它是一个向量，它的维度与输出特征的维度相同。偏置项的作用是使得线性层的输出能够更好地拟合数据。
+
+**[TASK 19]** 补全`cc/operators/nn.h`中`AddBias`类的构造函数和`forward`函数。
+
+- 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示输入特征和偏置。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给 `this->data`。
+
+- `forward`函数：`forward`方法实现前向传播，将偏置添加到输入特征上。`features`和`bias`分别从`this->objects`中获取，`features`的形状为`(batch_size x num_features)`，`bias`的形状为`(1 x num_features)`。在函数中，需要创建一个与`features`形状相同的输出张量`outNode`，使用嵌套循环将`features`的每个元素与`bias`的对应元素相加，结果存储在`outNode`中。最后，返回`outNode`。
+
+**[TASK 20]** 补全`cc/operators/nn.cc`中`AddBias`类的`backward`函数。
+
+- `backward()`函数实现反向传播，计算梯度并返回。它接受`std::shared_ptr<tensor::Tensor> gradient`作为输入，你需要计算`grad_features`和`grad_bias`，它们分别表示对`features`和`bias`的梯度。
+
+> 数学Tips：`grad_features`就是`gradient`的拷贝。而由于`batch_size`的存在，同一个`bias`被加到了`features`的每一行上，因此在计算`bias`的梯度时，需要将`gradient`每一列的元素（即沿`batch_size`维度）相加，得到`grad_bias`的对应元素，其形状为`(1 x num_features)`。
+
+#### 损失层——均方误差损失函数
+
+我们首先实现均方误差损失函数，它接受两个张量`y_pred`和`y_true`，它们分别表示预测值和真实值，输出一个标量，表示预测值与真实值之间的误差。
+
+用公式表示就是$\displaystyle loss = \frac{1}{2n} \sum_{i=1}^{n} (y_{pred}^{(i)} - y_{true}^{(i)})^2$，其中$n$为张量中元素的总个数（即对所有元素求平均）。
+
+**[TASK 21]** 补全`cc/operators/nn.h`中`SquareLoss`类的构造函数和`forward`函数。
+
+- 构造函数：构造函数接受两个参数`a`和`b`，它们都是`std::shared_ptr<Node>`类型的智能指针，分别表示预测值和真实值。构造函数调用基类`FunctionNode`的构造函数，并将`a`和`b`传递给它。在构造函数中，调用`this->forward()`方法，并将结果赋值给`this->data`。
+
+- `forward`函数用于计算损失。
+
+**[TASK 22]** 补全`cc/operators/nn.cc`中`SquareLoss`类的`backward`函数。
+
+- `backward`函数计算损失函数相对于输入`a`和`b`的梯度。`gradient`是损失函数对输出的梯度（是一个形状为(1, 1)的张量，可以直接认为其是一个标量`g`）。`grad_a`和`grad_b`分别存储`a`和`b`的梯度。对于每个元素，`grad_a`的值为`g * (a->data->data[i] - b->data->data[i]) / a->data->size`，`grad_b`的值为其相反数，即`g * (b->data->data[i] - a->data->data[i]) / a->data->size`。最终返回由 grad_a 和 grad_b 组成的向量。
+
+#### 损失层——SoftmaxLoss
+
+接下来，我们实现Softmax损失函数，它接受两个张量`y_pred`和`y_true`，它们分别表示预测值和真实值，输出一个标量，表示预测值与真实值之间的误差。
+
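+用公式表示就是$\displaystyle loss = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{true}^{(i,j)} \log\left(\mathrm{softmax}(y_{pred})^{(i,j)}\right)$，其中$N$为`batch_size`，$C$为类别数，`softmax`沿每一行（即每个样本）计算。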
+
+**[TASK 23]** 补全`cc/operators/nn.h`中`SoftmaxLoss`类的构造函数，`forward`函数和`backward`函数。
+
+完成上述内容后，你可以编译和运行`frontend/uct/regression.py`，使用线性网络来拟合`sin`函数。
+
+### 手写体识别
+
+补全代码中的其他标注有`TODO`的内容，最后编译运行，你就能够训练一个手写体识别模型了。运行`frontend/uct/mnist.py`来试一下吧！
+
+> 是不是觉得运行得有点慢？考虑使用多线程来加速矩阵运算。（这已经超出了这门课的要求，对高性能计算/并行计算感兴趣的同学可以勇于尝试！）
+
+### extra bonus
+
+想打副本？
+
+```
+nslookup -type=txt uc-cpp.shahe.org
+```
\ No newline at end of file