import numpy as np def format_shape(shape): return "x".join(map(str, shape)) if shape else "()" class Node(object): def __repr__(self): return "<{} shape={} at {}>".format( type(self).__name__, format_shape(self.data.shape), hex(id(self))) class DataNode(Node): """ DataNode is the parent class for Parameter and Constant nodes. You should not need to use this class directly. """ def __init__(self, data): self.parents = [] self.data = data def _forward(self, *inputs): return self.data @staticmethod def _backward(gradient, *inputs): return [] class Parameter(DataNode): """ A Parameter node stores parameters used in a neural network (or perceptron). Use the the `update` method to update parameters when training the perceptron or neural network. """ def __init__(self, *shape): assert len(shape) == 2, ( "Shape must have 2 dimensions, instead has {}".format(len(shape))) assert all(isinstance(dim, int) and dim > 0 for dim in shape), ( "Shape must consist of positive integers, got {!r}".format(shape)) limit = np.sqrt(3.0 / np.mean(shape)) data = np.random.uniform(low=-limit, high=limit, size=shape) super().__init__(data) def update(self, direction, multiplier): assert isinstance(direction, Constant), ( "Update direction must be a {} node, instead has type {!r}".format( Constant.__name__, type(direction).__name__)) assert direction.data.shape == self.data.shape, ( "Update direction shape {} does not match parameter shape " "{}".format( format_shape(direction.data.shape), format_shape(self.data.shape))) assert isinstance(multiplier, (int, float)), ( "Multiplier must be a Python scalar, instead has type {!r}".format( type(multiplier).__name__)) self.data += multiplier * direction.data assert np.all(np.isfinite(self.data)), ( "Parameter contains NaN or infinity after update, cannot continue") class Constant(DataNode): """ A Constant node is used to represent: * Input features * Output labels * Gradients computed by back-propagation You should not need to construct any Constant nodes directly; they will instead be provided by either the dataset or when you call `nn.gradients`. """ def __init__(self, data): assert isinstance(data, np.ndarray), ( "Data should be a numpy array, instead has type {!r}".format( type(data).__name__)) assert np.issubdtype(data.dtype, np.floating), ( "Data should be a float array, instead has data type {!r}".format( data.dtype)) super().__init__(data) class FunctionNode(Node): """ A FunctionNode represents a value that is computed based on other nodes. The FunctionNode class performs necessary book-keeping to compute gradients. """ def __init__(self, *parents): assert all(isinstance(parent, Node) for parent in parents), ( "Inputs must be node objects, instead got types {!r}".format( tuple(type(parent).__name__ for parent in parents))) self.parents = parents self.data = self._forward(*(parent.data for parent in parents)) class Add(FunctionNode): """ Adds matrices element-wise. Usage: nn.Add(x, y) Inputs: x: a Node with shape (batch_size x num_features) y: a Node with the same shape as x Output: a Node with shape (batch_size x num_features) """ @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[0].shape == inputs[1].shape, ( "Input shapes should match, instead got {} and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) return inputs[0] + inputs[1] @staticmethod def _backward(gradient, *inputs): assert gradient.shape == inputs[0].shape return [gradient, gradient] class AddBias(FunctionNode): """ Adds a bias vector to each feature vector Usage: nn.AddBias(features, bias) Inputs: features: a Node with shape (batch_size x num_features) bias: a Node with shape (1 x num_features) Output: a Node with shape (batch_size x num_features) """ @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[1].shape[0] == 1, ( "First dimension of second input should be 1, instead got shape " "{}".format(format_shape(inputs[1].shape))) assert inputs[0].shape[1] == inputs[1].shape[1], ( "Second dimension of inputs should match, instead got shapes {} " "and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) return inputs[0] + inputs[1] @staticmethod def _backward(gradient, *inputs): assert gradient.shape == inputs[0].shape return [gradient, np.sum(gradient, axis=0, keepdims=True)] class DotProduct(FunctionNode): """ Batched dot product Usage: nn.DotProduct(features, weights) Inputs: features: a Node with shape (batch_size x num_features) weights: a Node with shape (1 x num_features) Output: a Node with shape (batch_size x 1) """ @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[1].shape[0] == 1, ( "First dimension of second input should be 1, instead got shape " "{}".format(format_shape(inputs[1].shape))) assert inputs[0].shape[1] == inputs[1].shape[1], ( "Second dimension of inputs should match, instead got shapes {} " "and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) return np.dot(inputs[0], inputs[1].T) @staticmethod def _backward(gradient, *inputs): # assert gradient.shape[0] == inputs[0].shape[0] # assert gradient.shape[1] == 1 # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])] raise NotImplementedError( "Backpropagation through DotProduct nodes is not needed in this " "assignment") class Linear(FunctionNode): """ Applies a linear transformation (matrix multiplication) to the input Usage: nn.Linear(features, weights) Inputs: features: a Node with shape (batch_size x input_features) weights: a Node with shape (input_features x output_features) Output: a node with shape (batch_size x input_features) """ @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[0].shape[1] == inputs[1].shape[0], ( "Second dimension of first input should match first dimension of " "second input, instead got shapes {} and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) return np.dot(inputs[0], inputs[1]) @staticmethod def _backward(gradient, *inputs): assert gradient.shape[0] == inputs[0].shape[0] assert gradient.shape[1] == inputs[1].shape[1] return [np.dot(gradient, inputs[1].T), np.dot(inputs[0].T, gradient)] class ReLU(FunctionNode): """ An element-wise Rectified Linear Unit nonlinearity: max(x, 0). This nonlinearity replaces all negative entries in its input with zeros. Usage: nn.ReLU(x) Input: x: a Node with shape (batch_size x num_features) Output: a Node with the same shape as x, but no negative entries """ @staticmethod def _forward(*inputs): assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "Input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) return np.maximum(inputs[0], 0) @staticmethod def _backward(gradient, *inputs): assert gradient.shape == inputs[0].shape return [gradient * np.where(inputs[0] > 0, 1.0, 0.0)] class SquareLoss(FunctionNode): """ This node first computes 0.5 * (a[i,j] - b[i,j])**2 at all positions (i,j) in the inputs, which creates a (batch_size x dim) matrix. It then calculates and returns the mean of all elements in this matrix. Usage: nn.SquareLoss(a, b) Inputs: a: a Node with shape (batch_size x dim) b: a Node with shape (batch_size x dim) Output: a scalar Node (containing a single floating-point number) """ @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[0].shape == inputs[1].shape, ( "Input shapes should match, instead got {} and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) return np.mean(np.square(inputs[0] - inputs[1]) / 2) @staticmethod def _backward(gradient, *inputs): assert np.asarray(gradient).ndim == 0 return [ gradient * (inputs[0] - inputs[1]) / inputs[0].size, gradient * (inputs[1] - inputs[0]) / inputs[0].size ] class SoftmaxLoss(FunctionNode): """ A batched softmax loss, used for classification problems. IMPORTANT: do not swap the order of the inputs to this node! Usage: nn.SoftmaxLoss(logits, labels) Inputs: logits: a Node with shape (batch_size x num_classes). Each row represents the scores associated with that example belonging to a particular class. A score can be an arbitrary real number. labels: a Node with shape (batch_size x num_classes) that encodes the correct labels for the examples. All entries must be non-negative and the sum of values along each row should be 1. Output: a scalar Node (containing a single floating-point number) """ @staticmethod def log_softmax(logits): log_probs = logits - np.max(logits, axis=1, keepdims=True) log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True)) return log_probs @staticmethod def _forward(*inputs): assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs)) assert inputs[0].ndim == 2, ( "First input should have 2 dimensions, instead has {}".format( inputs[0].ndim)) assert inputs[1].ndim == 2, ( "Second input should have 2 dimensions, instead has {}".format( inputs[1].ndim)) assert inputs[0].shape == inputs[1].shape, ( "Input shapes should match, instead got {} and {}".format( format_shape(inputs[0].shape), format_shape(inputs[1].shape))) assert np.all(inputs[1] >= 0), ( "All entries in the labels input must be non-negative") assert np.allclose(np.sum(inputs[1], axis=1), 1), ( "Labels input must sum to 1 along each row") log_probs = SoftmaxLoss.log_softmax(inputs[0]) return np.mean(-np.sum(inputs[1] * log_probs, axis=1)) @staticmethod def _backward(gradient, *inputs): assert np.asarray(gradient).ndim == 0 log_probs = SoftmaxLoss.log_softmax(inputs[0]) return [ gradient * (np.exp(log_probs) - inputs[1]) / inputs[0].shape[0], gradient * -log_probs / inputs[0].shape[0] ] def gradients(loss, parameters): """ Computes and returns the gradient of the loss with respect to the provided parameters. Usage: nn.gradients(loss, parameters) Inputs: loss: a SquareLoss or SoftmaxLoss node parameters: a list (or iterable) containing Parameter nodes Output: a list of Constant objects, representing the gradient of the loss with respect to each provided parameter. """ assert isinstance(loss, (SquareLoss, SoftmaxLoss)), ( "Loss must be a loss node, instead has type {!r}".format( type(loss).__name__)) assert all(isinstance(parameter, Parameter) for parameter in parameters), ( "Parameters must all have type {}, instead got types {!r}".format( Parameter.__name__, tuple(type(parameter).__name__ for parameter in parameters))) assert not hasattr(loss, "used"), ( "Loss node has already been used for backpropagation, cannot reuse") loss.used = True nodes = set() tape = [] def visit(node): if node not in nodes: for parent in node.parents: visit(parent) nodes.add(node) tape.append(node) visit(loss) nodes |= set(parameters) grads = {node: np.zeros_like(node.data) for node in nodes} grads[loss] = 1.0 for node in reversed(tape): parent_grads = node._backward( grads[node], *(parent.data for parent in node.parents)) for parent, parent_grad in zip(node.parents, parent_grads): grads[parent] += parent_grad return [Constant(grads[parameter]) for parameter in parameters] def as_scalar(node): """ Returns the value of a Node as a standard Python number. This only works for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as DotProduct with a batch size of 1 element). """ assert isinstance(node, Node), ( "Input must be a node object, instead has type {!r}".format( type(node).__name__)) assert node.data.size == 1, ( "Node has shape {}, cannot convert to a scalar".format( format_shape(node.data.shape))) node.data = node.data.flatten() return node.data.tolist()[0]