#! /usr/bin/python
# -*- coding: utf-8 -*-
import tensorlayer as tl
from tensorlayer import logging
from tensorlayer.layers.core import Module
__all__ = ['OneHot', 'Word2vecEmbedding', 'Embedding', 'AverageEmbedding']
class OneHot(Module):
"""
The :class:`OneHot` class is the starting layer of a neural network, see ``tf.one_hot``.
Useful link: `https://www.tensorflow.org/api_docs/python/tf/one_hot`.
Parameters
----------
depth : None or int
The depth of the one-hot dimension (i.e. the number of classes). If the input indices are rank N, the output will have rank N+1; the new axis is created at dimension `axis` (by default, the new axis is appended at the end).
on_value : None or number
The value to represent `ON`. If None, it defaults to 1.
off_value : None or number
The value to represent `OFF`. If None, it defaults to 0.
axis : None or int
The axis.
dtype : None or TensorFlow dtype
The data type, None means tl.float32.
name : str
A unique layer name.
Examples
---------
>>> net = tl.layers.Input([32], dtype=tl.int32)
>>> onehot = tl.layers.OneHot(depth=8)
>>> print(onehot)
OneHot(depth=8, name='onehot')
>>> tensor = tl.layers.OneHot(depth=8)(net)
>>> print(tensor)
Tensor([...], shape=(32, 8), dtype=float32)
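The `on_value` / `off_value` arguments control the encoding values; for example, a bipolar (-1/+1) encoding:
>>> bipolar = tl.layers.OneHot(depth=8, on_value=1.0, off_value=-1.0)
>>> tensor = bipolar(net)  # shape (32, 8); selected positions are 1.0, all others -1.0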
"""
def __init__(self, depth=None, on_value=1.0, off_value=0.0, axis=-1, dtype=tl.float32, name=None):
super(OneHot, self).__init__(name)
self.depth = depth
self.on_value = on_value
self.off_value = off_value
self.axis = axis
self.dtype = dtype
logging.info("OneHotInput %s" % (self.name))
self.build()
self._built = True
if self.depth is None:
raise RuntimeError(self.__class__.__name__ + ": depth == None the number of output units is undefined")
def __repr__(self):
s = ('{classname}(depth={depth}')
if self.on_value is not None:
s += ', on_value={on_value}'
if self.off_value is not None:
s += ', off_value={off_value}'
if self.axis is not None:
s += ', axis={axis}'
if self.name is not None:
s += ', name=\'{name}\''
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def build(self, inputs_shape=None):
self.onehot = tl.ops.OneHot(
depth=self.depth, on_value=self.on_value, off_value=self.off_value, axis=self.axis, dtype=self.dtype
)
def forward(self, inputs):
"""
Parameters
----------
inputs : input tensor
The inputs are indices. The locations represented by the indices take value `on_value`, while all other locations take value `off_value`.
"""
outputs = self.onehot(inputs)
return outputs
class Word2vecEmbedding(Module):
"""
The :class:`Word2vecEmbedding` class is an embedding layer for Word2Vec training.
Words are input as integer indices and the output is the corresponding embedded word vectors.
The layer integrates an NCE loss by default (activate_nce_loss=True).
If the NCE loss is activated, a dynamic model can still skip its computation for a
particular forward pass by calling the layer with use_nce_loss=False.
The NCE loss can be deactivated entirely by setting activate_nce_loss=False.
Parameters
----------
vocabulary_size : int
The size of vocabulary, number of words
embedding_size : int
The number of embedding dimensions
num_sampled : int
The number of negative examples for NCE loss
activate_nce_loss : boolean
Whether activate nce loss or not. By default, True
If True, the layer will return both outputs of embedding and nce_cost in forward feeding.
If False, the layer will only return outputs of embedding.
In a dynamic model, the computation of nce loss can be turned off in forward feeding
by setting use_nce_loss=False when the layer is called.
In a static model, once the model is constructed, the computation of nce loss
cannot be changed (always computed or not computed).
nce_loss_args : dictionary
The keyword arguments passed to ``tl.NCELoss()``
E_init : initializer
The initializer for the embedding matrix
nce_W_init : initializer
The initializer for the NCE decoder weight matrix
nce_b_init : initializer
The initializer for the NCE decoder bias vector
name : str
A unique layer name
Attributes
----------
outputs : Tensor
The embedding layer outputs.
normalized_embeddings : Tensor
Normalized embedding matrix.
nce_weights : Tensor
The NCE weights only when activate_nce_loss is True.
nce_biases: Tensor
The NCE biases only when activate_nce_loss is True.
Examples
--------
Word2Vec With TensorLayer (Example in `examples/text_word_embedding/tutorial_word2vec_basic.py`)
>>> import numpy as np
>>> import tensorlayer as tl
>>> batch_size = 8
>>> embedding_size = 50
>>> inputs = tl.layers.Input([batch_size], dtype=tl.int32)
>>> labels = tl.layers.Input([batch_size, 1], dtype=tl.int32)
>>> emb_net = tl.layers.Word2vecEmbedding(
>>> vocabulary_size=10000,
>>> embedding_size=embedding_size,
>>> num_sampled=100,
>>> activate_nce_loss=True, # the nce loss is activated
>>> nce_loss_args={},
>>> E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
>>> nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))),
>>> nce_b_init=tl.initializers.constant(value=0.0),
>>> name='word2vec_layer',
>>> )
>>> print(emb_net)
Word2vecEmbedding(vocabulary_size=10000, embedding_size=50, num_sampled=100, activate_nce_loss=True, nce_loss_args={})
>>> embed_tensor = emb_net(inputs, use_nce_loss=False) # the nce loss is turned off, so labels are not needed
>>> embed_tensor = emb_net([inputs, labels], use_nce_loss=False) # the nce loss is turned off and the labels will be ignored
>>> embed_tensor, embed_nce_loss = emb_net([inputs, labels]) # the nce loss is calculated
>>> outputs = tl.layers.Dense(n_units=10, name="dense")(embed_tensor)
>>> model = tl.models.Model(inputs=[inputs, labels], outputs=[outputs, embed_nce_loss], name="word2vec_model") # a static model
>>> out = model([data_x, data_y], is_train=True) # where data_x is inputs and data_y is labels
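A minimal sketch of a dynamic (eager) training step, assuming the TensorFlow backend (so that ``tf.GradientTape`` and ``tf.optimizers`` are available) and that ``emb_net.trainable_weights`` collects the embedding and NCE variables:
>>> import tensorflow as tf
>>> optimizer = tf.optimizers.Adam(learning_rate=0.01)
>>> with tf.GradientTape() as tape:
>>>     _, nce = emb_net([data_x, data_y])  # nce loss computed by default
>>> grads = tape.gradient(nce, emb_net.trainable_weights)
>>> optimizer.apply_gradients(zip(grads, emb_net.trainable_weights))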
References
----------
`https://www.tensorflow.org/tutorials/representation/word2vec`
"""
def __init__(
self,
vocabulary_size,
embedding_size,
num_sampled=64,
activate_nce_loss=True,
nce_loss_args=None,
E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0),
nce_W_init=tl.initializers.truncated_normal(stddev=0.03),
nce_b_init=tl.initializers.constant(value=0.0),
name=None, #'word2vec',
):
super(Word2vecEmbedding, self).__init__(name)
self.vocabulary_size = vocabulary_size
self.embedding_size = embedding_size
self.num_sampled = num_sampled
self.E_init = E_init
self.activate_nce_loss = activate_nce_loss
if self.activate_nce_loss:
self.nce_loss_args = nce_loss_args if nce_loss_args is not None else {}
self.nce_W_init = nce_W_init
self.nce_b_init = nce_b_init
if not self._built:
self.build(tuple())
self._built = True
logging.info("Word2vecEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
def __repr__(self):
s = ('{classname}(')
s += 'vocabulary_size={vocabulary_size}'
s += ', embedding_size={embedding_size}'
s += ', num_sampled={num_sampled}'
s += ', activate_nce_loss={activate_nce_loss}'
if self.activate_nce_loss:
s += ', nce_loss_args={nce_loss_args}'
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def build(self, inputs_shape):
"""
Parameters
----------
inputs_shape : tuple
the shape of inputs tensor
"""
# Look up embeddings for inputs.
# Note: a row of 'embeddings' is the vector representation of a word.
# For speed, it is better to slice the embedding matrix directly
# instead of converting a word id to a one-hot vector and then
# multiplying it by the embedding matrix.
# embed is the output of the hidden (embedding) layer: a row vector
# with 'embedding_size' values.
self.embeddings = self._get_weights(
"embeddings",
shape=(self.vocabulary_size, self.embedding_size),
init=self.E_init,
)
self.normalized_embeddings = tl.L2Normalize(axis=1)(self.embeddings)
if self.activate_nce_loss:
# Construct the variables for the NCE loss (i.e. negative sampling)
self.nce_weights = self._get_weights(
"nce_weights",
shape=(self.vocabulary_size, self.embedding_size),
init=self.nce_W_init,
)
self.nce_biases = self._get_weights(
"nce_biases",
shape=(self.vocabulary_size, ),
init=self.nce_b_init,
)
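# NCE (noise-contrastive estimation) avoids a full softmax over the vocabulary:
# for every (input, label) pair it draws `num_sampled` negative words and trains
# a logistic classifier, parameterized by nce_weights and nce_biases, to tell the
# true label apart from the sampled noise words.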
self.embedding_lookup = tl.EmbeddingLookup()
if self.activate_nce_loss:
self.nce_loss = tl.NCELoss(**self.nce_loss_args)
def forward(self, inputs, use_nce_loss=None):
"""
Parameters
----------
inputs : tensor or list
If the nce loss is activated and is used, the argument should be a list of two tensors [inputs, labels].
Otherwise, the argument should be a single tensor which is inputs.
use_nce_loss: boolean
Whether to use the NCE loss in this run.
If the nce loss is to be used, activate_nce_loss must be True when the layer is initialized.
By default, same as activate_nce_loss.
Returns
-------
outputs : tensor
nce_cost : tensor
The nce_cost is returned only if the nce loss is used.
"""
if isinstance(inputs, list):
outputs = self.embedding_lookup(params=self.embeddings, ids=inputs[0])
else:
outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)
if use_nce_loss is True and not self.activate_nce_loss:
raise AttributeError(
"The nce loss is not activated when the %s is initialized. Please set activate_nce_loss=True." %
self.__class__.__name__
)
if self.activate_nce_loss and (use_nce_loss is True or use_nce_loss is None):
if not isinstance(inputs, list):
raise ValueError("If nce loss is used, the labels of inputs must be provided.")
nce_cost = tl.reduce_mean(
input_tensor=self.nce_loss(
weights=self.nce_weights, biases=self.nce_biases, inputs=outputs, labels=inputs[1],
num_sampled=self.num_sampled, num_classes=self.vocabulary_size
)
)
return outputs, nce_cost
return outputs
class Embedding(Module):
"""
The :class:`Embedding` class is a look-up table for word embedding.
Words are accessed by integer indices, and the output is the corresponding embedded word vectors.
To train a word embedding matrix, you can use :class:`Word2vecEmbedding`.
If you have a pre-trained matrix, you can assign it to the embedding parameters (see the sketch at the end of the Examples section).
Parameters
----------
vocabulary_size : int
The size of vocabulary, number of words.
embedding_size : int
The number of embedding dimensions.
E_init : initializer
The initializer for the embedding matrix.
name : str
A unique layer name.
Attributes
----------
outputs : tensor
The embedding layer output is a 3D tensor in the shape: (batch_size, num_steps(num_words), embedding_size).
Examples
--------
>>> import tensorlayer as tl
>>> input = tl.layers.Input([8, 100], dtype=tl.int32)
>>> embed = tl.layers.Embedding(vocabulary_size=1000, embedding_size=50, name='embed')
>>> print(embed)
Embedding(vocabulary_size=1000, embedding_size=50)
>>> tensor = embed(input)
>>> print(tensor)
Tensor([...], shape=(8, 100, 50), dtype=float32)
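A hedged sketch of loading a pre-trained matrix, assuming the TensorFlow backend where the ``embed.embeddings`` weight is a ``tf.Variable`` supporting ``assign``:
>>> import numpy as np
>>> pretrained = np.random.rand(1000, 50).astype(np.float32)  # stand-in for real pre-trained vectors
>>> embed.embeddings.assign(pretrained)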
"""
def __init__(
self,
vocabulary_size,
embedding_size,
E_init=tl.initializers.random_uniform(-0.1, 0.1),
name=None, #'embedding',
):
super(Embedding, self).__init__(name)
self.vocabulary_size = vocabulary_size
self.embedding_size = embedding_size
self.E_init = E_init
if not self._built:
self.build(tuple())
self._built = True
logging.info("Embedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
def __repr__(self):
s = ('{classname}(')
s += 'vocabulary_size={vocabulary_size}'
s += ', embedding_size={embedding_size}'
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def build(self, inputs_shape):
"""
Parameters
----------
inputs_shape : tuple
the shape of inputs tensor
"""
self.embeddings = self._get_weights(
"embeddings",
shape=(self.vocabulary_size, self.embedding_size),
init=self.E_init,
)
self.embedding_lookup = tl.EmbeddingLookup()
def forward(self, inputs):
"""
Parameters
----------
inputs : Tensor
The input of a network.
"""
outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)
return outputs
class AverageEmbedding(Module):
"""The :class:`AverageEmbedding` averages over embeddings of inputs.
This is often used as the input layer for models like DAN[1] and FastText[2].
Parameters
----------
vocabulary_size : int
The size of vocabulary.
embedding_size : int
The dimension of the embedding vectors.
pad_value : int
The scalar padding value used in the inputs, 0 by default.
E_init : initializer
The initializer of the embedding matrix.
name : str
A unique layer name.
Attributes
----------
outputs : tensor
The embedding layer output is a 2D tensor in the shape: (batch_size, embedding_size).
References
----------
- [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daume III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics.
- [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <http://arxiv.org/abs/1607.01759>`__
Examples
---------
>>> import tensorlayer as tl
>>> batch_size = 8
>>> length = 5
>>> input = tl.layers.Input([batch_size, length], dtype=tl.int32)
>>> avgembed = tl.layers.AverageEmbedding(vocabulary_size=1000, embedding_size=50, name='avg')
>>> print(avgembed)
AverageEmbedding(vocabulary_size=1000, embedding_size=50, pad_value=0)
>>> tensor = avgembed(input)
>>> print(tensor)
Tensor([...], shape=(8, 50), dtype=float32)
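A hedged sketch of the masking behaviour, assuming the active backend exposes ``tl.ops.convert_to_tensor`` for building a concrete integer tensor:
>>> ids = tl.ops.convert_to_tensor([[4, 7, 0, 0, 0]])  # 0 is the pad value
>>> vec = avgembed(ids)  # shape (1, 50): the mean of the embeddings of tokens 4 and 7 only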
"""
def __init__(
self,
vocabulary_size,
embedding_size,
pad_value=0,
E_init=tl.initializers.random_uniform(-0.1, 0.1),
name=None, # 'average_embedding',
):
super(AverageEmbedding, self).__init__(name)
self.vocabulary_size = vocabulary_size
self.embedding_size = embedding_size
self.pad_value = pad_value
self.E_init = E_init
if not self._built:
self.build(tuple())
self._built = True
logging.info("AverageEmbedding %s: (%d, %d)" % (self.name, self.vocabulary_size, self.embedding_size))
def __repr__(self):
s = ('{classname}(')
s += 'vocabulary_size={vocabulary_size}'
s += ', embedding_size={embedding_size}'
s += ', pad_value={pad_value}'
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def build(self, inputs_shape):
"""
Parameters
----------
inputs_shape : tuple
the shape of inputs tensor.
"""
# if len(inputs_shape) != 2:
# raise ValueError('inputs must be of size (batch_size, sentence_length)')
self.embeddings = self._get_weights(
"embeddings",
shape=(self.vocabulary_size, self.embedding_size),
init=self.E_init,
)
self.embedding_lookup = tl.EmbeddingLookup()
self.not_equal = tl.NotEqual()
self.cast = tl.Cast(tl.float32)
self.expand_dims = tl.ExpandDims(axis=-1)
self.reduce_sum = tl.ReduceSum(axis=1)
self.count_nonzero = tl.CountNonzero(keepdims=True, dtype=tl.float32)
def forward(self, inputs):
"""
Parameters
----------
inputs : tensor
The network input.
For word inputs, please use integer index format, 2D tensor: (batch_size, sentence_length).
"""
word_embeddings = self.embedding_lookup(params=self.embeddings, ids=inputs)
# Zero out embeddings of pad value
masks = self.not_equal(inputs, self.pad_value)
word_embeddings *= self.cast(self.expand_dims(masks))
sum_word_embeddings = self.reduce_sum(input=word_embeddings)
# Count number of non-padding words in each sentence
sentence_lengths = self.count_nonzero(masks, axis=1)
sentence_embeddings = tl.ops.divide(
sum_word_embeddings,
sentence_lengths + 1e-8, # Add epsilon to avoid dividing by 0
)
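# In summary: with mask m_i = [w_i != pad_value] and embedding matrix E, each sentence
# w_1..w_n is mapped to sum_i(m_i * E[w_i]) / (number of non-pad tokens + 1e-8),
# i.e. the mean of its non-padding word vectors.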
outputs = sentence_embeddings
return outputs