You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. import numpy as np
  2. np.random.seed(42)
  3. def format_shape(shape):
  4. return "x".join(map(str, shape)) if shape else "()"
  5. class Node(object):
  6. def __repr__(self):
  7. return "<{} shape={} at {}>".format(
  8. type(self).__name__, format_shape(self.data.shape), hex(id(self)))
  9. class DataNode(Node):
  10. """
  11. DataNode is the parent class for Parameter and Constant nodes.
  12. You should not need to use this class directly.
  13. """
  14. def __init__(self, data):
  15. self.parents = []
  16. self.data = data
  17. def _forward(self, *inputs):
  18. return self.data
  19. @staticmethod
  20. def _backward(gradient, *inputs):
  21. return []
  22. class Parameter(DataNode):
  23. """
  24. A Parameter node stores parameters used in a neural network (or perceptron).
  25. Use the the `update` method to update parameters when training the
  26. perceptron or neural network.
  27. """
  28. def __init__(self, *shape):
  29. assert len(shape) == 2, (
  30. "Shape must have 2 dimensions, instead has {}".format(len(shape)))
  31. assert all(isinstance(dim, int) and dim > 0 for dim in shape), (
  32. "Shape must consist of positive integers, got {!r}".format(shape))
  33. limit = np.sqrt(3.0 / np.mean(shape))
  34. data = np.random.uniform(low=-limit, high=limit, size=shape)
  35. super().__init__(data)
  36. def update(self, direction, multiplier):
  37. assert isinstance(direction, Constant), (
  38. "Update direction must be a {} node, instead has type {!r}".format(
  39. Constant.__name__, type(direction).__name__))
  40. assert direction.data.shape == self.data.shape, (
  41. "Update direction shape {} does not match parameter shape "
  42. "{}".format(
  43. format_shape(direction.data.shape),
  44. format_shape(self.data.shape)))
  45. assert isinstance(multiplier, (int, float)), (
  46. "Multiplier must be a Python scalar, instead has type {!r}".format(
  47. type(multiplier).__name__))
  48. self.data += multiplier * direction.data
  49. assert np.all(np.isfinite(self.data)), (
  50. "Parameter contains NaN or infinity after update, cannot continue")
  51. class Constant(DataNode):
  52. """
  53. A Constant node is used to represent:
  54. * Input features
  55. * Output labels
  56. * Gradients computed by back-propagation
  57. You should not need to construct any Constant nodes directly; they will
  58. instead be provided by either the dataset or when you call `nn.gradients`.
  59. """
  60. def __init__(self, data):
  61. assert isinstance(data, np.ndarray), (
  62. "Data should be a numpy array, instead has type {!r}".format(
  63. type(data).__name__))
  64. assert np.issubdtype(data.dtype, np.floating), (
  65. "Data should be a float array, instead has data type {!r}".format(
  66. data.dtype))
  67. super().__init__(data)
  68. class FunctionNode(Node):
  69. """
  70. A FunctionNode represents a value that is computed based on other nodes.
  71. The FunctionNode class performs necessary book-keeping to compute gradients.
  72. """
  73. def __init__(self, *parents):
  74. assert all(isinstance(parent, Node) for parent in parents), (
  75. "Inputs must be node objects, instead got types {!r}".format(
  76. tuple(type(parent).__name__ for parent in parents)))
  77. self.parents = parents
  78. self.data = self._forward(*(parent.data for parent in parents))
  79. class Add(FunctionNode):
  80. """
  81. Adds matrices element-wise.
  82. Usage: nn.Add(x, y)
  83. Inputs:
  84. x: a Node with shape (batch_size x num_features)
  85. y: a Node with the same shape as x
  86. Output:
  87. a Node with shape (batch_size x num_features)
  88. """
  89. @staticmethod
  90. def _forward(*inputs):
  91. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  92. assert inputs[0].ndim == 2, (
  93. "First input should have 2 dimensions, instead has {}".format(
  94. inputs[0].ndim))
  95. assert inputs[1].ndim == 2, (
  96. "Second input should have 2 dimensions, instead has {}".format(
  97. inputs[1].ndim))
  98. assert inputs[0].shape == inputs[1].shape, (
  99. "Input shapes should match, instead got {} and {}".format(
  100. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  101. return inputs[0] + inputs[1]
  102. @staticmethod
  103. def _backward(gradient, *inputs):
  104. assert gradient.shape == inputs[0].shape
  105. return [gradient, gradient]
  106. class AddBias(FunctionNode):
  107. """
  108. Adds a bias vector to each feature vector
  109. Usage: nn.AddBias(features, bias)
  110. Inputs:
  111. features: a Node with shape (batch_size x num_features)
  112. bias: a Node with shape (1 x num_features)
  113. Output:
  114. a Node with shape (batch_size x num_features)
  115. """
  116. @staticmethod
  117. def _forward(*inputs):
  118. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  119. assert inputs[0].ndim == 2, (
  120. "First input should have 2 dimensions, instead has {}".format(
  121. inputs[0].ndim))
  122. assert inputs[1].ndim == 2, (
  123. "Second input should have 2 dimensions, instead has {}".format(
  124. inputs[1].ndim))
  125. assert inputs[1].shape[0] == 1, (
  126. "First dimension of second input should be 1, instead got shape "
  127. "{}".format(format_shape(inputs[1].shape)))
  128. assert inputs[0].shape[1] == inputs[1].shape[1], (
  129. "Second dimension of inputs should match, instead got shapes {} "
  130. "and {}".format(
  131. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  132. return inputs[0] + inputs[1]
  133. @staticmethod
  134. def _backward(gradient, *inputs):
  135. assert gradient.shape == inputs[0].shape
  136. return [gradient, np.sum(gradient, axis=0, keepdims=True)]
  137. class DotProduct(FunctionNode):
  138. """
  139. Batched dot product
  140. Usage: nn.DotProduct(features, weights)
  141. Inputs:
  142. features: a Node with shape (batch_size x num_features)
  143. weights: a Node with shape (1 x num_features)
  144. Output: a Node with shape (batch_size x 1)
  145. """
  146. @staticmethod
  147. def _forward(*inputs):
  148. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  149. assert inputs[0].ndim == 2, (
  150. "First input should have 2 dimensions, instead has {}".format(
  151. inputs[0].ndim))
  152. assert inputs[1].ndim == 2, (
  153. "Second input should have 2 dimensions, instead has {}".format(
  154. inputs[1].ndim))
  155. assert inputs[1].shape[0] == 1, (
  156. "First dimension of second input should be 1, instead got shape "
  157. "{}".format(format_shape(inputs[1].shape)))
  158. assert inputs[0].shape[1] == inputs[1].shape[1], (
  159. "Second dimension of inputs should match, instead got shapes {} "
  160. "and {}".format(
  161. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  162. return np.dot(inputs[0], inputs[1].T)
  163. @staticmethod
  164. def _backward(gradient, *inputs):
  165. # assert gradient.shape[0] == inputs[0].shape[0]
  166. # assert gradient.shape[1] == 1
  167. # return [np.dot(gradient, inputs[1]), np.dot(gradient.T, inputs[0])]
  168. raise NotImplementedError(
  169. "Backpropagation through DotProduct nodes is not needed in this "
  170. "assignment")
  171. class Linear(FunctionNode):
  172. """
  173. Applies a linear transformation (matrix multiplication) to the input
  174. Usage: nn.Linear(features, weights)
  175. Inputs:
  176. features: a Node with shape (batch_size x input_features)
  177. weights: a Node with shape (input_features x output_features)
  178. Output: a node with shape (batch_size x input_features)
  179. """
  180. @staticmethod
  181. def _forward(*inputs):
  182. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  183. assert inputs[0].ndim == 2, (
  184. "First input should have 2 dimensions, instead has {}".format(
  185. inputs[0].ndim))
  186. assert inputs[1].ndim == 2, (
  187. "Second input should have 2 dimensions, instead has {}".format(
  188. inputs[1].ndim))
  189. assert inputs[0].shape[1] == inputs[1].shape[0], (
  190. "Second dimension of first input should match first dimension of "
  191. "second input, instead got shapes {} and {}".format(
  192. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  193. return np.dot(inputs[0], inputs[1])
  194. @staticmethod
  195. def _backward(gradient, *inputs):
  196. assert gradient.shape[0] == inputs[0].shape[0]
  197. assert gradient.shape[1] == inputs[1].shape[1]
  198. return [np.dot(gradient, inputs[1].T), np.dot(inputs[0].T, gradient)]
  199. class ReLU(FunctionNode):
  200. """
  201. An element-wise Rectified Linear Unit nonlinearity: max(x, 0).
  202. This nonlinearity replaces all negative entries in its input with zeros.
  203. Usage: nn.ReLU(x)
  204. Input:
  205. x: a Node with shape (batch_size x num_features)
  206. Output: a Node with the same shape as x, but no negative entries
  207. """
  208. @staticmethod
  209. def _forward(*inputs):
  210. assert len(inputs) == 1, "Expected 1 input, got {}".format(len(inputs))
  211. assert inputs[0].ndim == 2, (
  212. "Input should have 2 dimensions, instead has {}".format(
  213. inputs[0].ndim))
  214. return np.maximum(inputs[0], 0)
  215. @staticmethod
  216. def _backward(gradient, *inputs):
  217. assert gradient.shape == inputs[0].shape
  218. return [gradient * np.where(inputs[0] > 0, 1.0, 0.0)]
  219. class SquareLoss(FunctionNode):
  220. """
  221. This node first computes 0.5 * (a[i,j] - b[i,j])**2 at all positions (i,j)
  222. in the inputs, which creates a (batch_size x dim) matrix. It then calculates
  223. and returns the mean of all elements in this matrix.
  224. Usage: nn.SquareLoss(a, b)
  225. Inputs:
  226. a: a Node with shape (batch_size x dim)
  227. b: a Node with shape (batch_size x dim)
  228. Output: a scalar Node (containing a single floating-point number)
  229. """
  230. @staticmethod
  231. def _forward(*inputs):
  232. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  233. assert inputs[0].ndim == 2, (
  234. "First input should have 2 dimensions, instead has {}".format(
  235. inputs[0].ndim))
  236. assert inputs[1].ndim == 2, (
  237. "Second input should have 2 dimensions, instead has {}".format(
  238. inputs[1].ndim))
  239. assert inputs[0].shape == inputs[1].shape, (
  240. "Input shapes should match, instead got {} and {}".format(
  241. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  242. return np.mean(np.square(inputs[0] - inputs[1]) / 2)
  243. @staticmethod
  244. def _backward(gradient, *inputs):
  245. assert np.asarray(gradient).ndim == 0
  246. return [
  247. gradient * (inputs[0] - inputs[1]) / inputs[0].size,
  248. gradient * (inputs[1] - inputs[0]) / inputs[0].size
  249. ]
  250. class SoftmaxLoss(FunctionNode):
  251. """
  252. A batched softmax loss, used for classification problems.
  253. IMPORTANT: do not swap the order of the inputs to this node!
  254. Usage: nn.SoftmaxLoss(logits, labels)
  255. Inputs:
  256. logits: a Node with shape (batch_size x num_classes). Each row
  257. represents the scores associated with that example belonging to a
  258. particular class. A score can be an arbitrary real number.
  259. labels: a Node with shape (batch_size x num_classes) that encodes the
  260. correct labels for the examples. All entries must be non-negative
  261. and the sum of values along each row should be 1.
  262. Output: a scalar Node (containing a single floating-point number)
  263. """
  264. @staticmethod
  265. def log_softmax(logits):
  266. log_probs = logits - np.max(logits, axis=1, keepdims=True)
  267. log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True))
  268. return log_probs
  269. @staticmethod
  270. def _forward(*inputs):
  271. assert len(inputs) == 2, "Expected 2 inputs, got {}".format(len(inputs))
  272. assert inputs[0].ndim == 2, (
  273. "First input should have 2 dimensions, instead has {}".format(
  274. inputs[0].ndim))
  275. assert inputs[1].ndim == 2, (
  276. "Second input should have 2 dimensions, instead has {}".format(
  277. inputs[1].ndim))
  278. assert inputs[0].shape == inputs[1].shape, (
  279. "Input shapes should match, instead got {} and {}".format(
  280. format_shape(inputs[0].shape), format_shape(inputs[1].shape)))
  281. assert np.all(inputs[1] >= 0), (
  282. "All entries in the labels input must be non-negative")
  283. assert np.allclose(np.sum(inputs[1], axis=1), 1), (
  284. "Labels input must sum to 1 along each row")
  285. log_probs = SoftmaxLoss.log_softmax(inputs[0])
  286. return np.mean(-np.sum(inputs[1] * log_probs, axis=1))
  287. @staticmethod
  288. def _backward(gradient, *inputs):
  289. assert np.asarray(gradient).ndim == 0
  290. log_probs = SoftmaxLoss.log_softmax(inputs[0])
  291. return [
  292. gradient * (np.exp(log_probs) - inputs[1]) / inputs[0].shape[0],
  293. gradient * -log_probs / inputs[0].shape[0]
  294. ]
  295. def gradients(loss, parameters):
  296. """
  297. Computes and returns the gradient of the loss with respect to the provided
  298. parameters.
  299. Usage: nn.gradients(loss, parameters)
  300. Inputs:
  301. loss: a SquareLoss or SoftmaxLoss node
  302. parameters: a list (or iterable) containing Parameter nodes
  303. Output: a list of Constant objects, representing the gradient of the loss
  304. with respect to each provided parameter.
  305. """
  306. assert isinstance(loss, (SquareLoss, SoftmaxLoss)), (
  307. "Loss must be a loss node, instead has type {!r}".format(
  308. type(loss).__name__))
  309. assert all(isinstance(parameter, Parameter) for parameter in parameters), (
  310. "Parameters must all have type {}, instead got types {!r}".format(
  311. Parameter.__name__,
  312. tuple(type(parameter).__name__ for parameter in parameters)))
  313. assert not hasattr(loss, "used"), (
  314. "Loss node has already been used for backpropagation, cannot reuse")
  315. loss.used = True
  316. nodes = set()
  317. tape = []
  318. def visit(node):
  319. if node not in nodes:
  320. for parent in node.parents:
  321. visit(parent)
  322. nodes.add(node)
  323. tape.append(node)
  324. visit(loss)
  325. nodes |= set(parameters)
  326. grads = {node: np.zeros_like(node.data) for node in nodes}
  327. grads[loss] = 1.0
  328. for node in reversed(tape):
  329. parent_grads = node._backward(
  330. grads[node], *(parent.data for parent in node.parents))
  331. for parent, parent_grad in zip(node.parents, parent_grads):
  332. grads[parent] += parent_grad
  333. return [Constant(grads[parameter]) for parameter in parameters]
  334. def as_scalar(node):
  335. """
  336. Returns the value of a Node as a standard Python number. This only works
  337. for nodes with one element (e.g. SquareLoss and SoftmaxLoss, as well as
  338. DotProduct with a batch size of 1 element).
  339. """
  340. assert isinstance(node, Node), (
  341. "Input must be a node object, instead has type {!r}".format(
  342. type(node).__name__))
  343. assert node.data.size == 1, (
  344. "Node has shape {}, cannot convert to a scalar".format(
  345. format_shape(node.data.shape)))
  346. node.data = node.data.flatten()
  347. return node.data.tolist()[0]