You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

embedding.py 19 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import tensorlayer as tl
  4. from tensorlayer import logging
  5. from tensorlayer.layers.core import Module
  6. # from tensorlayer.layers.core import LayersConfig
  7. __all__ = ['OneHot', 'Word2vecEmbedding', 'Embedding', 'AverageEmbedding']
  8. class OneHot(Module):
  9. """
  10. The :class:`OneHot` class is the starting layer of a neural network, see ``tf.one_hot``.
  11. Useful link: `https://www.tensorflow.org/api_docs/python/tf/one_hot`.
  12. Parameters
  13. ----------
  14. depth : None or int
  15. If the input indices is rank N, the output will have rank N+1. The new axis is created at dimension `axis` (default: the new axis is appended at the end).
  16. on_value : None or number
  17. The value to represnt `ON`. If None, it will default to the value 1.
  18. off_value : None or number
  19. The value to represnt `OFF`. If None, it will default to the value 0.
  20. axis : None or int
  21. The axis.
  22. dtype : None or TensorFlow dtype
  23. The data type, None means tf.float32.
  24. name : str
  25. A unique layer name.
  26. Examples
  27. ---------
  28. >>> import tensorflow as tf
  29. >>> import tensorlayer as tl
  30. >>> net = tl.layers.Input([32], dtype=tl.int32)
  31. >>> onehot = tl.layers.OneHot(depth=8)
  32. >>> print(onehot)
  33. OneHot(depth=8, name='onehot')
  34. >>> tensor = tl.layers.OneHot(depth=8)(net)
  35. >>> print(tensor)
  36. tf.Tensor([...], shape=(32, 8), dtype=float32)
  37. """
  38. def __init__(self, depth=None, on_value=1.0, off_value=0.0, axis=-1, dtype=tl.float32, name=None):
  39. super(OneHot, self).__init__(name)
  40. self.depth = depth
  41. self.on_value = on_value
  42. self.off_value = off_value
  43. self.axis = axis
  44. self.dtype = dtype
  45. logging.info("OneHotInput %s" % (self.name))
  46. self.build()
  47. self._built = True
  48. if self.depth is None:
  49. raise RuntimeError(self.__class__.__name__ + ": depth == None the number of output units is undefined")
  50. def __repr__(self):
  51. s = ('{classname}(depth={depth}')
  52. if self.on_value is not None:
  53. s += ', on_value={on_value}'
  54. if self.off_value is not None:
  55. s += ', off_value={off_value}'
  56. if self.axis is not None:
  57. s += ', axis={axis}'
  58. if self.name is not None:
  59. s += ', name=\'{name}\''
  60. s += ')'
  61. return s.format(classname=self.__class__.__name__, **self.__dict__)
  62. def build(self, inputs_shape=None):
  63. self.onehot = tl.ops.OneHot(
  64. depth=self.depth, on_value=self.on_value, off_value=self.off_value, axis=self.axis, dtype=self.dtype
  65. )
  66. def forward(self, inputs):
  67. """
  68. Parameters
  69. ----------
  70. inputs : input tensor
  71. The inputs are indices. The locations represented by indices in indices take value on_value, while all other locations take value off_value.
  72. """
  73. outputs = self.onehot(inputs)
  74. return outputs
  75. class Word2vecEmbedding(Module):
  76. """
  77. The :class:`Word2vecEmbedding` class is a fully connected layer.
  78. For Word Embedding, words are input as integer index.
  79. The output is the embedded word vector.
  80. The layer integrates NCE loss by default (activate_nce_loss=True).
  81. If the NCE loss is activated, in a dynamic model,
  82. the computation of nce loss can be turned off in customised forward feeding
  83. by setting use_nce_loss=False when the layer is called.
  84. The NCE loss can be deactivated by setting activate_nce_loss=False.
  85. Parameters
  86. ----------
  87. vocabulary_size : int
  88. The size of vocabulary, number of words
  89. embedding_size : int
  90. The number of embedding dimensions
  91. num_sampled : int
  92. The number of negative examples for NCE loss
  93. activate_nce_loss : boolean
  94. Whether activate nce loss or not. By default, True
  95. If True, the layer will return both outputs of embedding and nce_cost in forward feeding.
  96. If False, the layer will only return outputs of embedding.
  97. In a dynamic model, the computation of nce loss can be turned off in forward feeding
  98. by setting use_nce_loss=False when the layer is called.
  99. In a static model, once the model is constructed, the computation of nce loss
  100. cannot be changed (always computed or not computed).
  101. nce_loss_args : dictionary
  102. The arguments for tf.ops.nce_loss()
  103. E_init : initializer
  104. The initializer for initializing the embedding matrix
  105. nce_W_init : initializer
  106. The initializer for initializing the nce decoder weight matrix
  107. nce_b_init : initializer
  108. The initializer for initializing of the nce decoder bias vector
  109. name : str
  110. A unique layer name
  111. Attributes
  112. ----------
  113. outputs : Tensor
  114. The embedding layer outputs.
  115. normalized_embeddings : Tensor
  116. Normalized embedding matrix.
  117. nce_weights : Tensor
  118. The NCE weights only when activate_nce_loss is True.
  119. nce_biases: Tensor
  120. The NCE biases only when activate_nce_loss is True.
  121. Examples
  122. --------
  123. Word2Vec With TensorLayer (Example in `examples/text_word_embedding/tutorial_word2vec_basic.py`)
  124. >>> import tensorflow as tf
  125. >>> import tensorlayer as tl
  126. >>> batch_size = 8
  127. >>> embedding_size = 50
  128. >>> inputs = tl.layers.Input([batch_size], dtype=tf.int32)
  129. >>> labels = tl.layers.Input([batch_size, 1], dtype=tf.int32)
  130. >>> emb_net = tl.layers.Word2vecEmbedding(
  131. >>> vocabulary_size=10000,
  132. >>> embedding_size=embedding_size,
  133. >>> num_sampled=100,
  134. >>> activate_nce_loss=True, # the nce loss is activated
  135. >>> nce_loss_args={},
  136. >>> E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
  137. >>> nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))),
  138. >>> nce_b_init=tl.initializers.constant(value=0.0),
  139. >>> name='word2vec_layer',
  140. >>> )
  141. >>> print(emb_net)
  142. Word2vecEmbedding(vocabulary_size=10000, embedding_size=50, num_sampled=100, activate_nce_loss=True, nce_loss_args={})
  143. >>> embed_tensor = emb_net(inputs, use_nce_loss=False) # the nce loss is turned off and no need to provide labels
  144. >>> embed_tensor = emb_net([inputs, labels], use_nce_loss=False) # the nce loss is turned off and the labels will be ignored
  145. >>> embed_tensor, embed_nce_loss = emb_net([inputs, labels]) # the nce loss is calculated
  146. >>> outputs = tl.layers.Dense(n_units=10, name="dense")(embed_tensor)
  147. >>> model = tl.models.Model(inputs=[inputs, labels], outputs=[outputs, embed_nce_loss], name="word2vec_model") # a static model
  148. >>> out = model([data_x, data_y], is_train=True) # where data_x is inputs and data_y is labels
  149. References
  150. ----------
  151. `https://www.tensorflow.org/tutorials/representation/word2vec`
  152. """
  153. def __init__(
  154. self,
  155. vocabulary_size,
  156. embedding_size,
  157. num_sampled=64,
  158. activate_nce_loss=True,
  159. nce_loss_args=None,
  160. E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
  161. nce_W_init=tl.initializers.truncated_normal(stddev=0.03),
  162. nce_b_init=tl.initializers.constant(value=0.0),
  163. name=None, #'word2vec',
  164. ):
  165. super(Word2vecEmbedding, self).__init__(name)
  166. self.vocabulary_size = vocabulary_size
  167. self.embedding_size = embedding_size
  168. self.num_sampled = num_sampled
  169. self.E_init = E_init
  170. self.activate_nce_loss = activate_nce_loss
  171. if self.activate_nce_loss:
  172. self.nce_loss_args = nce_loss_args
  173. self.nce_W_init = nce_W_init
  174. self.nce_b_init = nce_b_init
  175. if not self._built:
  176. self.build(tuple())
  177. self._built = True
  178. logging.info("Word2vecEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
  179. def __repr__(self):
  180. s = ('{classname}(')
  181. s += 'vocabulary_size={vocabulary_size}'
  182. s += ', embedding_size={embedding_size}'
  183. s += ', num_sampled={num_sampled}'
  184. s += ', activate_nce_loss={activate_nce_loss}'
  185. if self.activate_nce_loss:
  186. s += ', nce_loss_args={nce_loss_args}'
  187. s += ')'
  188. return s.format(classname=self.__class__.__name__, **self.__dict__)
  189. def build(self, inputs_shape):
  190. """
  191. Parameters
  192. ----------
  193. inputs_shape : tuple
  194. the shape of inputs tensor
  195. """
  196. # Look up embeddings for inputs.
  197. # Note: a row of 'embeddings' is the vector representation of a word.
  198. # for the sake of speed, it is better to slice the embedding matrix
  199. # instead of transferring a word id to one-hot-format vector and then
  200. # multiply by the embedding matrix.
  201. # embed is the outputs of the hidden layer (embedding layer), it is a
  202. # row vector with 'embedding_size' values.
  203. self.embeddings = self._get_weights(
  204. "embeddings",
  205. shape=(self.vocabulary_size, self.embedding_size),
  206. init=self.E_init,
  207. )
  208. self.normalized_embeddings = tl.L2Normalize(axis=1)(self.embeddings)
  209. if self.activate_nce_loss:
  210. # Construct the variables for the NCE loss (i.e. negative sampling)
  211. self.nce_weights = self._get_weights(
  212. "nce_weights",
  213. shape=(self.vocabulary_size, self.embedding_size),
  214. init=self.nce_W_init,
  215. )
  216. self.nce_biases = self._get_weights(
  217. "nce_biases",
  218. shape=(self.vocabulary_size, ),
  219. init=self.nce_b_init,
  220. )
  221. self.embedding_lookup = tl.EmbeddingLookup()
  222. if self.activate_nce_loss:
  223. self.nce_loss = tl.NCELoss(**self.nce_loss_args)
  224. def forward(self, inputs, use_nce_loss=None):
  225. """
  226. Parameters
  227. ----------
  228. inputs : tensor or list
  229. If the nce loss is activated and is used, the argument should be a list of two tensors [inputs, labels].
  230. Otherwise, the argument should be a single tensor which is inputs.
  231. use_nce_loss: boolean
  232. Whether use NCE loss in this run.
  233. If the nce loss is used, the activate_nce_loss should be True when the layer is initialized.
  234. By default, same as activate_nce_loss.
  235. Outputs:
  236. ----------
  237. outputs: tensor
  238. nce_cost: tensor
  239. The nce_cost is returned only if the nce_loss is used.
  240. """
  241. if isinstance(inputs, list):
  242. outputs = self.embedding_lookup(params=self.embeddings, ids=inputs[0])
  243. else:
  244. outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)
  245. if use_nce_loss is True and not self.activate_nce_loss:
  246. raise AttributeError(
  247. "The nce loss is not activated when the %s is initialized. Please set activate_nce_loss=True." %
  248. self.__class__.__name__
  249. )
  250. if self.activate_nce_loss and (use_nce_loss is True or use_nce_loss is None):
  251. if not isinstance(inputs, list):
  252. raise ValueError("If nce loss is used, the labels of inputs must be provided.")
  253. nce_cost = tl.reduce_mean(
  254. input_tensor=self.nce_loss(
  255. weights=self.nce_weights, biases=self.nce_biases, inputs=outputs, labels=inputs[1],
  256. num_sampled=self.num_sampled, num_classes=self.vocabulary_size
  257. )
  258. )
  259. return outputs, nce_cost
  260. return outputs
  261. class Embedding(Module):
  262. """
  263. The :class:`Embedding` class is a look-up table for word embedding.
  264. Word content are accessed using integer indexes, then the output is the embedded word vector.
  265. To train a word embedding matrix, you can used :class:`Word2vecEmbedding`.
  266. If you have a pre-trained matrix, you can assign the parameters into it.
  267. Parameters
  268. ----------
  269. vocabulary_size : int
  270. The size of vocabulary, number of words.
  271. embedding_size : int
  272. The number of embedding dimensions.
  273. E_init : initializer
  274. The initializer for the embedding matrix.
  275. E_init_args : dictionary
  276. The arguments for embedding matrix initializer.
  277. name : str
  278. A unique layer name.
  279. Attributes
  280. ----------
  281. outputs : tensor
  282. The embedding layer output is a 3D tensor in the shape: (batch_size, num_steps(num_words), embedding_size).
  283. Examples
  284. --------
  285. >>> import tensorflow as tf
  286. >>> import tensorlayer as tl
  287. >>> input = tl.layers.Input([8, 100], dtype=tf.int32)
  288. >>> embed = tl.layers.Embedding(vocabulary_size=1000, embedding_size=50, name='embed')
  289. >>> print(embed)
  290. Embedding(vocabulary_size=1000, embedding_size=50)
  291. >>> tensor = embed(input)
  292. >>> print(tensor)
  293. tf.Tensor([...], shape=(8, 100, 50), dtype=float32)
  294. """
  295. def __init__(
  296. self,
  297. vocabulary_size,
  298. embedding_size,
  299. E_init=tl.initializers.random_uniform(-0.1, 0.1),
  300. name=None, #'embedding',
  301. ):
  302. super(Embedding, self).__init__(name)
  303. self.vocabulary_size = vocabulary_size
  304. self.embedding_size = embedding_size
  305. self.E_init = E_init
  306. if not self._built:
  307. self.build(tuple())
  308. self._built = True
  309. logging.info("Embedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
  310. def __repr__(self):
  311. s = ('{classname}(')
  312. s += 'vocabulary_size={vocabulary_size}'
  313. s += ', embedding_size={embedding_size}'
  314. s += ')'
  315. return s.format(classname=self.__class__.__name__, **self.__dict__)
  316. def build(self, inputs_shape):
  317. """
  318. Parameters
  319. ----------
  320. inputs_shape : tuple
  321. the shape of inputs tensor
  322. """
  323. self.embeddings = self._get_weights(
  324. "embeddings",
  325. shape=(self.vocabulary_size, self.embedding_size),
  326. init=self.E_init,
  327. )
  328. self.embedding_lookup = tl.EmbeddingLookup()
  329. def forward(self, inputs):
  330. """
  331. Parameters
  332. ----------
  333. inputs : Tensor
  334. The input of a network.
  335. """
  336. outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)
  337. return outputs
  338. class AverageEmbedding(Module):
  339. """The :class:`AverageEmbedding` averages over embeddings of inputs.
  340. This is often used as the input layer for models like DAN[1] and FastText[2].
  341. Parameters
  342. ----------
  343. vocabulary_size : int
  344. The size of vocabulary.
  345. embedding_size : int
  346. The dimension of the embedding vectors.
  347. pad_value : int
  348. The scalar padding value used in inputs, 0 as default.
  349. E_init : initializer
  350. The initializer of the embedding matrix.
  351. name : str
  352. A unique layer name.
  353. Attributes
  354. ----------
  355. outputs : tensor
  356. The embedding layer output is a 2D tensor in the shape: (batch_size, embedding_size).
  357. References
  358. ----------
  359. - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daum’e III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics.
  360. - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <http://arxiv.org/abs/1607.01759>`__
  361. Examples
  362. ---------
  363. >>> import tensorflow as tf
  364. >>> import tensorlayer as tl
  365. >>> batch_size = 8
  366. >>> length = 5
  367. >>> input = tl.layers.Input([batch_size, length], dtype=tf.int32)
  368. >>> avgembed = tl.layers.AverageEmbedding(vocabulary_size=1000, embedding_size=50, name='avg')
  369. >>> print(avgembed)
  370. AverageEmbedding(vocabulary_size=1000, embedding_size=50, pad_value=0)
  371. >>> tensor = avgembed(input)
  372. >>> print(tensor)
  373. tf.Tensor([...], shape=(8, 50), dtype=float32)
  374. """
  375. def __init__(
  376. self,
  377. vocabulary_size,
  378. embedding_size,
  379. pad_value=0,
  380. E_init=tl.initializers.random_uniform(-0.1, 0.1),
  381. name=None, # 'average_embedding',
  382. ):
  383. super(AverageEmbedding, self).__init__(name)
  384. self.vocabulary_size = vocabulary_size
  385. self.embedding_size = embedding_size
  386. self.pad_value = pad_value
  387. self.E_init = E_init
  388. if not self._built:
  389. self.build(tuple())
  390. self._built = True
  391. logging.info("AverageEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
  392. def __repr__(self):
  393. s = ('{classname}(')
  394. s += 'vocabulary_size={vocabulary_size}'
  395. s += ', embedding_size={embedding_size}'
  396. s += ', pad_value={pad_value}'
  397. s += ')'
  398. return s.format(classname=self.__class__.__name__, **self.__dict__)
  399. def build(self, inputs_shape):
  400. """
  401. Parameters
  402. ----------
  403. inputs_shape : tuple
  404. the shape of inputs tensor.
  405. """
  406. # if len(inputs_shape) != 2:
  407. # raise ValueError('inputs must be of size (batch_size, sentence_length)')
  408. self.embeddings = self._get_weights(
  409. "embeddings",
  410. shape=(self.vocabulary_size, self.embedding_size),
  411. init=self.E_init,
  412. )
  413. self.embedding_lookup = tl.EmbeddingLookup()
  414. self.not_equal = tl.Not_equal()
  415. self.cast = tl.Cast(tl.float32)
  416. self.expand_dims = tl.ExpandDims(axis=-1)
  417. self.reduce_sum = tl.ReduceSum(axis=1)
  418. self.count_nonzero = tl.Count_nonzero(keepdims=True, dtype=tl.float32)
  419. def forward(self, inputs):
  420. """
  421. Parameters
  422. ----------
  423. inputs : tensor
  424. The network input.
  425. For word inputs, please use integer index format, 2D tensor: (batch_size, sentence_length).
  426. """
  427. word_embeddings = self.embedding_lookup(params=self.embeddings, ids=inputs)
  428. # Zero out embeddings of pad value
  429. masks = self.not_equal(inputs, self.pad_value)
  430. word_embeddings *= self.cast(self.expand_dims(masks))
  431. sum_word_embeddings = self.reduce_sum(input=word_embeddings)
  432. # Count number of non-padding words in each sentence
  433. sentence_lengths = self.count_nonzero(masks, axis=1)
  434. sentence_embeddings = tl.ops.divide(
  435. sum_word_embeddings,
  436. sentence_lengths + 1e-8, # Add epsilon to avoid dividing by 0
  437. )
  438. outputs = sentence_embeddings
  439. return outputs

TensorLayer 3.0 是一款兼容多种深度学习框架为计算后端的深度学习库，计划兼容 TensorFlow、PyTorch、MindSpore、Paddle。