import numpy as np

def format_shape(shape):
    return "x".join(map(str, shape)) if shape else "()"

class Node(object):
    def __repr__(self):
        return "<{} shape={} at {}>".format(
            type(self).__name__, format_shape(self.data.shape), hex(id(self)))

class DataNode(Node):
    """
    DataNode is the parent class for Parameter and Constant nodes.

    You should not need to use this class directly.
    """
    def __init__(self, data):
        self.parents = []
        self.data = data

    def _forward(self, *inputs):
        return self.data

    @staticmethod
    def _backward(gradient, *inputs):
        return []

class Parameter(DataNode):
    """
    A Parameter node stores parameters used in a neural network (or perceptron).

    Use the the `update` method to update parameters when training the
    perceptron or neural network.
    """
    def __init__(self, *shape):
        assert len(shape) == 2, (
            "Shape must have 2 dimensions, instead has {}".format(len(shape)))
        assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
            "Shape must consist of positive integers, got {!r}".format(shape))
        limit = np.sqrt(3.0 / np.mean(shape))
        data = np.random.uniform(low=-limit, high=limit, size=shape)
        super().__init__(data)

    def update(self, direction, multiplier):
        assert isinstance(direction, Constant), (
            "Update direction must be a {} node, instead has type {!r}".format(
                Constant.__name__, type(direction).__name__))
        assert direction.data.shape == self.data.shape, (
            "Update direction shape {} does not match parameter shape "
            "{}".format(
                format_shape(direction.data.shape),
                format_shape(self.data.shape)))
        assert isinstance(multiplier, (int, float)), (
            "Multiplier must be a Python scalar, instead has type {!r}".format(
                type(multiplier).__name__))
        self.data += multiplier * direction.data
        assert np.all(np.isfinite(self.data)), (
            "Parameter contains NaN or infinity after update, cannot continue")

class Constant(DataNode):
    """
    A Constant node is used to represent:
    * Input features
    * Output labels
    * Gradients computed by back-propagation

    You should not need to construct any Constant nodes directly; they will
    instead be provided by either the dataset or when you call `nn.gradients`.
    """
    def __init__(self, data):
        assert isinstance(data, np.ndarray), (
            "Data should be a numpy array, instead has type {!r}".format(
                type(data).__name__))
        assert np.issubdtype(data.dtype, np.floating), (
            "Data should be a float array, instead has data type {!r}".format(
                data.dtype))
        super().__init__(data)

class FunctionNode(Node):
    """
    A FunctionNode represents a value that is computed based on other nodes.
    The FunctionNode class performs necessary book-keeping to compute gradients.
    """
    def __init__(self, *parents):
        assert all(isinstance(parent, Node) for parent in parents), (
            "Inputs must be node objects, instead got types {!r}".format(
                tuple(type(parent).__name__ for parent in parents)))
        self.parents = parents
        self.data = self._forward(*(parent.data for parent in parents))

class Add(FunctionNode):
    """
    Adds matrices element-wise.

    Usage: nn.Add(x, y)
    Inputs:
        x: a Node with shape (batch_size x num_features)
        y: a Node with the same shape as x
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[0].shape == inputs[1].shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        return inputs[0] + inputs[1]

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        return [gradient, gradient]

class AddBias(FunctionNode):
    """
    Adds a bias vector to each feature vector

    Usage: nn.AddBias(features, bias)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        bias: a Node with shape (1 x num_features)
    Output:
        a Node with shape (batch_size x num_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[1].shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(inputs[1].shape)))
        assert inputs[0].shape[1] == inputs[1].shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        return inputs[0] + inputs[1]

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        return [gradient, np.sum(gradient, axis=0, keepdims=True)]

class DotProduct(FunctionNode):
    """
    Batched dot product

    Usage: nn.DotProduct(features, weights)
    Inputs:
        features: a Node with shape (batch_size x num_features)
        weights: a Node with shape (1 x num_features)
    Output: a Node with shape (batch_size x 1)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[1].shape[0] == 1, (
            "First dimension of second input should be 1, instead got shape "
            "{}".format(format_shape(inputs[1].shape)))
        assert inputs[0].shape[1] == inputs[1].shape[1], (
            "Second dimension of inputs should match, instead got shapes {} "
            "and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        return np.dot(inputs[0], inputs[1].T)

    @staticmethod
    def _backward(gradient, *inputs):
        # assert gradient.shape[0] == inputs[0].shape[0]
        # assert gradient.shape[1] == 1
        # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
        raise NotImplementedError(
            "Backpropagation through DotProduct nodes is not needed in this "
            "assignment")

class Linear(FunctionNode):
    """
    Applies a linear transformation (matrix multiplication) to the input

    Usage: nn.Linear(features, weights)
    Inputs:
        features: a Node with shape (batch_size x input_features)
        weights: a Node with shape (input_features x output_features)
    Output: a node with shape (batch_size x input_features)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[0].shape[1] == inputs[1].shape[0], (
            "Second dimension of first input should match first dimension of "
            "second input, instead got shapes {} and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        return np.dot(inputs[0], inputs[1])

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape[0] == inputs[0].shape[0]
        assert gradient.shape[1] == inputs[1].shape[1]
        return [np.dot(gradient, inputs[1].T), np.dot(inputs[0].T, gradient)]

class ReLU(FunctionNode):
    """
    An element-wise Rectified Linear Unit nonlinearity: max(x, 0).
    This nonlinearity replaces all negative entries in its input with zeros.

    Usage: nn.ReLU(x)
    Input:
        x: a Node with shape (batch_size x num_features)
    Output: a Node with the same shape as x, but no negative entries
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "Input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        return np.maximum(inputs[0], 0)

    @staticmethod
    def _backward(gradient, *inputs):
        assert gradient.shape == inputs[0].shape
        return [gradient * np.where(inputs[0] > 0, 1.0, 0.0)]

class SquareLoss(FunctionNode):
    """
    This node first computes 0.5 * (a[i,j] - b[i,j])**2 at all positions (i,j)
    in the inputs, which creates a (batch_size x dim) matrix. It then calculates
    and returns the mean of all elements in this matrix.

    Usage: nn.SquareLoss(a, b)
    Inputs:
        a: a Node with shape (batch_size x dim)
        b: a Node with shape (batch_size x dim)
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[0].shape == inputs[1].shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        return np.mean(np.square(inputs[0] - inputs[1]) / 2)

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        return [
            gradient * (inputs[0] - inputs[1]) / inputs[0].size,
            gradient * (inputs[1] - inputs[0]) / inputs[0].size
        ]

class SoftmaxLoss(FunctionNode):
    """
    A batched softmax loss, used for classification problems.

    IMPORTANT: do not swap the order of the inputs to this node!

    Usage: nn.SoftmaxLoss(logits, labels)
    Inputs:
        logits: a Node with shape (batch_size x num_classes). Each row
            represents the scores associated with that example belonging to a
            particular class. A score can be an arbitrary real number.
        labels: a Node with shape (batch_size x num_classes) that encodes the
            correct labels for the examples. All entries must be non-negative
            and the sum of values along each row should be 1.
    Output: a scalar Node (containing a single floating-point number)
    """
    @staticmethod
    def log_softmax(logits):
        log_probs = logits - np.max(logits, axis=1, keepdims=True)
        log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True))
        return log_probs

    @staticmethod
    def _forward(*inputs):
        assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
        assert inputs[0].ndim == 2, (
            "First input should have 2 dimensions, instead has {}".format(
                inputs[0].ndim))
        assert inputs[1].ndim == 2, (
            "Second input should have 2 dimensions, instead has {}".format(
                inputs[1].ndim))
        assert inputs[0].shape == inputs[1].shape, (
            "Input shapes should match, instead got {} and {}".format(
                format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
        assert np.all(inputs[1] >= 0), (
            "All entries in the labels input must be non-negative")
        assert np.allclose(np.sum(inputs[1], axis=1), 1), (
            "Labels input must sum to 1 along each row")
        log_probs = SoftmaxLoss.log_softmax(inputs[0])
        return np.mean(-np.sum(inputs[1] * log_probs, axis=1))

    @staticmethod
    def _backward(gradient, *inputs):
        assert np.asarray(gradient).ndim == 0
        log_probs = SoftmaxLoss.log_softmax(inputs[0])
        return [
            gradient * (np.exp(log_probs) - inputs[1]) / inputs[0].shape[0],
            gradient * -log_probs / inputs[0].shape[0]
        ]

def gradients(loss, parameters):
    """
    Computes and returns the gradient of the loss with respect to the provided
    parameters.

    Usage: nn.gradients(loss, parameters)
    Inputs:
        loss: a SquareLoss or SoftmaxLoss node
        parameters: a list (or iterable) containing Parameter nodes
    Output: a list of Constant objects, representing the gradient of the loss
        with respect to each provided parameter.
    """

    assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
        "Loss must be a loss node, instead has type {!r}".format(
            type(loss).__name__))
    assert all(isinstance(parameter, Parameter) for parameter in parameters), (
        "Parameters must all have type {}, instead got types {!r}".format(
            Parameter.__name__,
            tuple(type(parameter).__name__ for parameter in parameters)))
    assert not hasattr(loss, "used"), (
        "Loss node has already been used for backpropagation, cannot reuse")

    loss.used = True

    nodes = set()
    tape = []

    def visit(node):
        if node not in nodes:
            for parent in node.parents:
                visit(parent)
            nodes.add(node)
            tape.append(node)

    visit(loss)
    nodes |= set(parameters)

    grads = {node: np.zeros_like(node.data) for node in nodes}
    grads[loss] = 1.0

    for node in reversed(tape):
        parent_grads = node._backward(
            grads[node], *(parent.data for parent in node.parents))
        for parent, parent_grad in zip(node.parents, parent_grads):
            grads[parent] += parent_grad

    return [Constant(grads[parameter]) for parameter in parameters]

def as_scalar(node):
    """
    Returns the value of a Node as a standard Python number. This only works
    for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
    DotProduct with a batch size of 1 element).
    """

    assert isinstance(node, Node), (
        "Input must be a node object, instead has type {!r}".format(
            type(node).__name__))
    assert node.data.size == 1, (
        "Node has shape {}, cannot convert to a scalar".format(
            format_shape(node.data.shape)))
    node.data = node.data.flatten()
    return node.data.tolist()[0]