tensorlayer3/tensorlayer/layers/recurrent.py


#! /usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import tensorlayer as tl
from tensorlayer import logging
from tensorlayer.backend.ops.load_backend import BACKEND
from tensorlayer.layers.core import Module
__all__ = [
'RNN',
'RNNCell',
'GRU',
'LSTM',
'GRUCell',
'LSTMCell',
]
class RNNCell(Module):
"""An Elman RNN cell with tanh or ReLU non-linearity.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
act : activation function
The non-linearity to use. Can be either 'tanh' or 'relu'. Default: 'tanh'
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
A tensor with shape `[batch_size, input_size]`.
states : tensor or None
A tensor with shape `[batch_size, hidden_size]`. When states is None, zero state is used. Defaults to None.
Returns
----------
outputs : tensor
A tensor with shape `[batch_size, hidden_size]`.
states : tensor
A tensor with shape `[batch_size, hidden_size]`.
Tensor containing the next hidden state for each element in the batch
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([4, 16], name='input')
>>> prev_h = tl.layers.Input([4,32])
>>> cell = tl.layers.RNNCell(input_size=16, hidden_size=32, bias=True, act='tanh', name='rnncell_1')
>>> y, h = cell(input, prev_h)
>>> print(y.shape)
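A further minimal sketch (not part of the original example set): reusing the same cell over several time steps and carrying the hidden state forward. The per-step inputs are assumed placeholders of shape `[batch_size, input_size]`.
>>> h = None
>>> for _ in range(3):
>>>     step = tl.layers.Input([4, 16])
>>>     y, h = cell(step, h)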
"""
def __init__(
self,
input_size,
hidden_size,
bias=True,
act='tanh',
name=None,
):
super(RNNCell, self).__init__(name)
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
if act not in ('relu', 'tanh'):
raise ValueError("Activation should be 'tanh' or 'relu'.")
self.act = act
self.build(None)
logging.info("RNNCell %s: input_size: %d hidden_size: %d act: %s" % (self.name, input_size, hidden_size, act))
def __repr__(self):
actstr = self.act
s = ('{classname}(input_size={input_size}, hidden_size={hidden_size}')
s += ', bias=True' if self.bias else ', bias=False'
s += ', act=' + actstr
if self.name is not None:
s += ', name=\'{name}\''
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def check_input(self, input_shape):
if input_shape[1] != self.input_size:
raise ValueError(
'input should have consistent input_size. But got {}, expected {}'.format(
input_shape[1], self.input_size
)
)
def check_hidden(self, input_shape, h_shape, hidden_label):
if input_shape[0] != h_shape[0]:
raise ValueError(
'input batch size {} should match hidden {} batch size {}.'.format(
input_shape[0], hidden_label, h_shape[0]
)
)
if h_shape[1] != self.hidden_size:
raise ValueError(
'hidden {} should have consistent hidden_size. But got {}, expected {}.'.format(
hidden_label, h_shape[1], self.hidden_size
)
)
def build(self, inputs_shape):
stdv = 1.0 / np.sqrt(self.hidden_size)
_init = tl.initializers.RandomUniform(minval=-stdv, maxval=stdv)
self.weight_ih_shape = (self.hidden_size, self.input_size)
self.weight_hh_shape = (self.hidden_size, self.hidden_size)
self.weight_ih = self._get_weights("weight_ih", shape=self.weight_ih_shape, init=_init)
self.weight_hh = self._get_weights("weight_hh", shape=self.weight_hh_shape, init=_init)
if self.bias:
self.bias_ih_shape = (self.hidden_size, )
self.bias_hh_shape = (self.hidden_size, )
self.bias_ih = self._get_weights('bias_ih', shape=self.bias_ih_shape, init=_init)
self.bias_hh = self._get_weights('bias_hh', shape=self.bias_hh_shape, init=_init)
else:
self.bias_ih = None
self.bias_hh = None
self.rnncell = tl.ops.rnncell(
weight_ih=self.weight_ih, weight_hh=self.weight_hh, bias_ih=self.bias_ih, bias_hh=self.bias_hh, act=self.act
)
def forward(self, inputs, states=None):
input_shape = tl.get_tensor_shape(inputs)
self.check_input(input_shape)
if states is None:
states = tl.zeros(shape=(input_shape[0], self.hidden_size), dtype=inputs.dtype)
states_shape = tl.get_tensor_shape(states)
self.check_hidden(input_shape, states_shape, hidden_label='h')
output, states = self.rnncell(inputs, states)
return output, states
class LSTMCell(Module):
"""A long short-term memory (LSTM) cell.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
A tensor with shape `[batch_size, input_size]`.
states : tuple or None
A tuple of two tensors `(h, c)`, each of shape `[batch_size, hidden_size]`. When states is None, zero states are used. Defaults to None.
Returns
----------
outputs : tensor
A tensor with shape `[batch_size, hidden_size]`.
states : tuple
A tuple of two tensors `(h, c)`, each of shape `[batch_size, hidden_size]`.
Tensors containing the next hidden state and next cell state for each element in the batch.
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([4, 16], name='input')
>>> prev_h = tl.layers.Input([4,32])
>>> prev_c = tl.layers.Input([4,32])
>>> cell = tl.layers.LSTMCell(input_size=16, hidden_size=32, bias=True, name='lstmcell_1')
>>> y, (h, c) = cell(input, (prev_h, prev_c))
>>> print(y.shape)
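A further minimal sketch (an assumed usage pattern, not part of the original example): the `(h, c)` tuple returned at one step is fed back as the states of the next step, and passing `None` starts from zero states.
>>> states = None
>>> for _ in range(3):
>>>     step = tl.layers.Input([4, 16])
>>>     y, states = cell(step, states)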
"""
def __init__(
self,
input_size,
hidden_size,
bias=True,
name=None,
):
super(LSTMCell, self).__init__(name)
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.build(None)
logging.info("LSTMCell %s: input_size: %d hidden_size: %d " % (self.name, input_size, hidden_size))
def __repr__(self):
s = ('{classname}(input_size={input_size}, hidden_size={hidden_size}')
s += ', bias=True' if self.bias else ', bias=False'
if self.name is not None:
s += ', name=\'{name}\''
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def check_input(self, input_shape):
if input_shape[1] != self.input_size:
raise ValueError(
'input should have consistent input_size. But got {}, expected {}'.format(
input_shape[1], self.input_size
)
)
def check_hidden(self, input_shape, h_shape, hidden_label):
if input_shape[0] != h_shape[0]:
raise ValueError(
'input batch size {} should match hidden {} batch size {}.'.format(
input_shape[0], hidden_label, h_shape[0]
)
)
if h_shape[1] != self.hidden_size:
raise ValueError(
'hidden {} should have consistent hidden_size. But got {}, expected {}.'.format(
hidden_label, h_shape[1], self.hidden_size
)
)
def build(self, inputs_shape):
stdv = 1.0 / np.sqrt(self.hidden_size)
_init = tl.initializers.RandomUniform(minval=-stdv, maxval=stdv)
self.weight_ih_shape = (4 * self.hidden_size, self.input_size)
self.weight_hh_shape = (4 * self.hidden_size, self.hidden_size)
self.weight_ih = self._get_weights("weight_ih", shape=self.weight_ih_shape, init=_init)
self.weight_hh = self._get_weights("weight_hh", shape=self.weight_hh_shape, init=_init)
if self.bias:
self.bias_ih_shape = (4 * self.hidden_size, )
self.bias_hh_shape = (4 * self.hidden_size, )
self.bias_ih = self._get_weights('bias_ih', shape=self.bias_ih_shape, init=_init)
self.bias_hh = self._get_weights('bias_hh', shape=self.bias_hh_shape, init=_init)
else:
self.bias_ih = None
self.bias_hh = None
self.lstmcell = tl.ops.lstmcell(
weight_ih=self.weight_ih, weight_hh=self.weight_hh, bias_ih=self.bias_ih, bias_hh=self.bias_hh
)
def forward(self, inputs, states=None):
input_shape = tl.get_tensor_shape(inputs)
self.check_input(input_shape)
if states is not None:
h, c = states
else:
h = tl.zeros(shape=(input_shape[0], self.hidden_size), dtype=inputs.dtype)
c = tl.zeros(shape=(input_shape[0], self.hidden_size), dtype=inputs.dtype)
h_shape = tl.get_tensor_shape(h)
c_shape = tl.get_tensor_shape(c)
self.check_hidden(input_shape, h_shape, hidden_label='h')
self.check_hidden(input_shape, c_shape, hidden_label='c')
output, new_h, new_c = self.lstmcell(inputs, h, c)
return output, (new_h, new_c)
class GRUCell(Module):
"""A gated recurrent unit (GRU) cell.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
A tensor with shape `[batch_size, input_size]`.
states : tensor or None
A tensor with shape `[batch_size, hidden_size]`. When states is None, zero state is used. Defaults: `None`.
Returns
----------
outputs : tensor
A tensor with shape `[batch_size, hidden_size]`.
states : tensor
A tensor with shape `[batch_size, hidden_size]`.
Tensor containing the next hidden state for each element in the batch
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([4, 16], name='input')
>>> prev_h = tl.layers.Input([4,32])
>>> cell = tl.layers.GRUCell(input_size=16, hidden_size=32, bias=True, name='grucell_1')
>>> y, h = cell(input, prev_h)
>>> print(y.shape)
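A further minimal sketch: omitting `states` (or passing `None`) starts from a zero hidden state, and the returned hidden state can be fed back into the next call.
>>> y1, h1 = cell(input)
>>> y2, h2 = cell(input, h1)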
"""
def __init__(
self,
input_size,
hidden_size,
bias=True,
name=None,
):
super(GRUCell, self).__init__(name)
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.build(None)
logging.info("GRUCell %s: input_size: %d hidden_size: %d " % (self.name, input_size, hidden_size))
def __repr__(self):
s = ('{classname}(input_size={input_size}, hidden_size={hidden_size}')
s += ', bias=True' if self.bias else ', bias=False'
if self.name is not None:
s += ', name=\'{name}\''
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def check_input(self, input_shape):
if input_shape[1] != self.input_size:
raise ValueError(
'input should have consistent input_size. But got {}, expected {}'.format(
input_shape[1], self.input_size
)
)
def check_hidden(self, input_shape, h_shape, hidden_label):
if input_shape[0] != h_shape[0]:
raise ValueError(
'input batch size {} should match hidden {} batch size {}.'.format(
input_shape[0], hidden_label, h_shape[0]
)
)
if h_shape[1] != self.hidden_size:
raise ValueError(
'hidden {} should have consistent hidden_size. But got {}, expected {}.'.format(
hidden_label, h_shape[1], self.hidden_size
)
)
def build(self, inputs_shape):
stdv = 1.0 / np.sqrt(self.hidden_size)
_init = tl.initializers.RandomUniform(minval=-stdv, maxval=stdv)
self.weight_ih_shape = (3 * self.hidden_size, self.input_size)
self.weight_hh_shape = (3 * self.hidden_size, self.hidden_size)
self.weight_ih = self._get_weights("weight_ih", shape=self.weight_ih_shape, init=_init)
self.weight_hh = self._get_weights("weight_hh", shape=self.weight_hh_shape, init=_init)
if self.bias:
self.bias_ih_shape = (3 * self.hidden_size, )
self.bias_hh_shape = (3 * self.hidden_size, )
self.bias_ih = self._get_weights('bias_ih', shape=self.bias_ih_shape, init=_init)
self.bias_hh = self._get_weights('bias_hh', shape=self.bias_hh_shape, init=_init)
else:
self.bias_ih = None
self.bias_hh = None
self.grucell = tl.ops.grucell(
weight_ih=self.weight_ih, weight_hh=self.weight_hh, bias_ih=self.bias_ih, bias_hh=self.bias_hh
)
def forward(self, inputs, states=None):
input_shape = tl.get_tensor_shape(inputs)
self.check_input(input_shape)
if states is None:
states = tl.zeros(shape=(input_shape[0], self.hidden_size), dtype=inputs.dtype)
states_shape = tl.get_tensor_shape(states)
self.check_hidden(input_shape, states_shape, hidden_label='h')
output, states = self.grucell(inputs, states)
return output, states
class RNNBase(Module):
"""
RNNBase class for RNN networks. It provides `forward` and other common methods for RNN, LSTM and GRU.
"""
def __init__(
self,
mode,
input_size,
hidden_size,
num_layers=1,
bias=True,
batch_first=False,
dropout=0.0,
bidirectional=False,
name=None,
):
super(RNNBase, self).__init__(name)
self.mode = mode
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bias = bias
self.batch_first = batch_first
self.dropout = dropout
self.bidirectional = bidirectional
self.build(None)
logging.info(
"%s: %s: input_size: %d hidden_size: %d num_layers: %d " %
(self.mode, self.name, input_size, hidden_size, num_layers)
)
def __repr__(self):
s = (
'{classname}(input_size={input_size}, hidden_size={hidden_size}, num_layers={num_layers}'
', dropout={dropout}'
)
s += ', bias=True' if self.bias else ', bias=False'
s += ', bidirectional=True' if self.bidirectional else ', bidirectional=False'
if self.name is not None:
s += ', name=\'{name}\''
s += ')'
return s.format(classname=self.__class__.__name__, **self.__dict__)
def build(self, inputs_shape):
if BACKEND == 'tensorflow':
bidirect = 2 if self.bidirectional else 1
self.weights_fw = []
self.bias_fw = []
self.weights_bw = []
self.bias_bw = []
stdv = 1.0 / np.sqrt(self.hidden_size)
_init = tl.initializers.RandomUniform(minval=-stdv, maxval=stdv)
if self.mode == 'LSTM':
gate_size = 4 * self.hidden_size
elif self.mode == 'GRU':
gate_size = 3 * self.hidden_size
else:
gate_size = self.hidden_size
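# TensorFlow backend: the per-layer parameters are created here and handed to
# tl.ops.rnnbase as flat lists. For each layer (and, when bidirectional, each direction)
# the input-to-hidden and hidden-to-hidden weights (and biases, if enabled) are appended
# in order: forward-direction parameters go into weights_fw/bias_fw and reverse-direction
# parameters into weights_bw/bias_bw.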
for layer in range(self.num_layers):
for direction in range(bidirect):
layer_input_size = self.input_size if layer == 0 else self.hidden_size * bidirect
if direction == 0:
self.w_ih = self._get_weights(
'weight_ih_l' + str(layer), shape=(gate_size, layer_input_size), init=_init
)
self.w_hh = self._get_weights(
'weight_hh_l' + str(layer), shape=(gate_size, self.hidden_size), init=_init
)
self.weights_fw.append(self.w_ih)
self.weights_fw.append(self.w_hh)
if self.bias:
self.b_ih = self._get_weights('bias_ih_l' + str(layer), shape=(gate_size, ), init=_init)
self.b_hh = self._get_weights('bias_hh_l' + str(layer), shape=(gate_size, ), init=_init)
self.bias_fw.append(self.b_ih)
self.bias_fw.append(self.b_hh)
else:
self.w_ih = self._get_weights(
'weight_ih_l' + str(layer) + '_reverse', shape=(gate_size, layer_input_size), init=_init
)
self.w_hh = self._get_weights(
'weight_hh_l' + str(layer) + '_reverse', shape=(gate_size, self.hidden_size), init=_init
)
self.weights_bw.append(self.w_ih)
self.weights_bw.append(self.w_hh)
if self.bias:
self.b_ih = self._get_weights(
'bias_ih_l' + str(layer) + '_reverse', shape=(gate_size, ), init=_init
)
self.b_hh = self._get_weights(
'bias_hh_l' + str(layer) + '_reverse', shape=(gate_size, ), init=_init
)
self.bias_bw.append(self.b_ih)
self.bias_bw.append(self.b_hh)
self.rnn = tl.ops.rnnbase(
mode=self.mode, input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.num_layers,
bias=self.bias, batch_first=self.batch_first, dropout=self.dropout, bidirectional=self.bidirectional,
is_train=self.is_train, weights_fw=self.weights_fw, weights_bw=self.weights_bw, bias_fw=self.bias_fw,
bias_bw=self.bias_bw
)
else:
self.rnn = tl.ops.rnnbase(
mode=self.mode,
input_size=self.input_size,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
bias=self.bias,
batch_first=self.batch_first,
dropout=self.dropout,
bidirectional=self.bidirectional,
is_train=self.is_train,
)
def forward(self, input, states=None):
output, new_states = self.rnn(input, states)
return output, new_states
class RNN(RNNBase):
"""Multilayer Elman network(RNN). It takes input sequences and initial
states as inputs, and returns the output sequences and the final states.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
num_layers : int
Number of recurrent layers. Default: 1
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
batch_first : bool
If ``True``, then the input and output tensors are provided as `[batch_size, seq, input_size]`. Default: ``False``
dropout : float
If non-zero, introduces a `Dropout` layer on the outputs of each RNN layer except the last layer,
with dropout probability equal to `dropout`. Default: 0
bidirectional : bool
If ``True``, becomes a bidirectional RNN. Default: ``False``
act : activation function
The non-linearity to use. Can be either 'tanh' or 'relu'. Default: 'tanh'
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
the input sequence. If `batch_first` is True, the shape is `[batch_size, seq, input_size]`, else, the shape is `[seq, batch_size, input_size]`.
initial_states : tensor or None
the initial states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
If the RNN is Bidirectional, num_directions should be 2, else it should be 1. Default: None.
Returns
----------
outputs : tensor
the output sequence. If `batch_first` is True, the shape is `[batch_size, seq, num_directions * hidden_size]`,
else, the shape is `[seq, batch_size, num_directions * hidden_size]`.
final_states : tensor
final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that if the RNN is Bidirectional, the forward states are (0,2,4,6,...) and
the backward states are (1,3,5,7,....).
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([23, 32, 16], name='input')
>>> prev_h = tl.layers.Input([4, 32, 32])
>>> cell = tl.layers.RNN(input_size=16, hidden_size=32, bias=True, num_layers=2, bidirectional=True, act='tanh', batch_first=False, dropout=0, name='rnn_1')
>>> y, h = cell(input, prev_h)
>>> print(y.shape)
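A further minimal sketch (hypothetical shapes) of the `batch_first=True` layout with the default zero initial states.
>>> rnn = tl.layers.RNN(input_size=16, hidden_size=32, num_layers=1, batch_first=True, name='rnn_2')
>>> x = tl.layers.Input([8, 23, 16])  # [batch_size, seq, input_size]
>>> y, h = rnn(x)  # y: [8, 23, 32], h: [1, 8, 32]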
"""
def __init__(
self,
input_size,
hidden_size,
num_layers=1,
bias=True,
batch_first=False,
dropout=0.0,
bidirectional=False,
act='tanh',
name=None,
):
if act == 'tanh':
mode = 'RNN_TANH'
elif act == 'relu':
mode = 'RNN_RELU'
else:
raise ValueError("act should be in ['tanh', 'relu'], but got {}.".format(act))
super(RNN, self
).__init__(mode, input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, name)
class LSTM(RNNBase):
"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
num_layers : int
Number of recurrent layers. Default: 1
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
batch_first : bool
If ``True``, then the input and output tensors are provided as `[batch_size, seq, input_size]`. Default: ``False``
dropout : float
If non-zero, introduces a `Dropout` layer on the outputs of each LSTM layer except the last layer,
with dropout probability equal to `dropout`. Default: 0
bidirectional : bool
If ``True``, becomes a bidirectional LSTM. Default: ``False``
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
the input sequence. If `batch_first` is True, the shape is `[batch_size, seq, input_size]`, else, the shape is `[seq, batch_size, input_size]`.
initial_states : tuple or None
the initial states. A tuple of two tensors `(h, c)`, the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
If the LSTM is Bidirectional, num_directions should be 2, else it should be 1. Default: None.
Returns
----------
outputs : tensor
the output sequence. If `batch_first` is True, the shape is `[batch_size, seq, num_directions * hidden_size]`,
else, the shape is `[seq, batch_size, num_directions * hidden_size]`.
final_states : tuple
final states. A tuple of two tensors. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. Note that if the LSTM is Bidirectional, the forward states are (0,2,4,6,...) and
the backward states are (1,3,5,7,....).
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([23, 32, 16], name='input')
>>> prev_h = tl.layers.Input([4, 32, 32])
>>> prev_c = tl.layers.Input([4, 32, 32])
>>> cell = tl.layers.LSTM(input_size=16, hidden_size=32, bias=True, num_layers=2, bidirectional=True, batch_first=False, dropout=0, name='lstm_1')
>>> y, (h, c) = cell(input, (prev_h, prev_c))
>>> print(y.shape)
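A further minimal sketch (hypothetical shapes) of a unidirectional LSTM that starts from the default zero initial states and returns its final states as an `(h, c)` tuple.
>>> lstm = tl.layers.LSTM(input_size=16, hidden_size=32, num_layers=1, name='lstm_2')
>>> x = tl.layers.Input([23, 32, 16])  # [seq, batch_size, input_size]
>>> y, (h, c) = lstm(x)  # y: [23, 32, 32], h and c: [1, 32, 32]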
"""
def __init__(
self,
input_size,
hidden_size,
num_layers=1,
bias=True,
batch_first=False,
dropout=0.0,
bidirectional=False,
name=None,
):
super(LSTM, self
).__init__('LSTM', input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, name)
class GRU(RNNBase):
"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
Parameters
----------
input_size : int
The number of expected features in the input `x`
hidden_size : int
The number of features in the hidden state `h`
num_layers : int
Number of recurrent layers. Default: 1
bias : bool
If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
batch_first : bool
If ``True``, then the input and output tensors are provided as `[batch_size, seq, input_size]`. Default: ``False``
dropout : float
If non-zero, introduces a `Dropout` layer on the outputs of each GRU layer except the last layer,
with dropout probability equal to `dropout`. Default: 0
bidirectional : bool
If ``True``, becomes a bidirectional GRU. Default: ``False``
name : None or str
A unique layer name
Inputs
----------
inputs : tensor
the input sequence. If `batch_first` is True, the shape is `[batch_size, seq, input_size]`, else, the shape is `[seq, batch_size, input_size]`.
initial_states : tensor or None
the initial states. A tensor with shape `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
If the GRU is Bidirectional, num_directions should be 2, else it should be 1. Default: None.
Returns
----------
outputs : tensor
the output sequence. If `batch_first` is True, the shape is `[batch_size, seq, num_directions * hidden_size]`,
else, the shape is `[seq, batch_size, num_directions * hidden_size]`.
final_states : tensor
final states. A tensor with shape `[num_layers * num_directions, batch_size, hidden_size]`. Note that if the GRU is Bidirectional, the forward states are (0,2,4,6,...) and
the backward states are (1,3,5,7,....).
Examples
--------
With TensorLayer
>>> input = tl.layers.Input([23, 32, 16], name='input')
>>> prev_h = tl.layers.Input([4, 32, 32])
>>> cell = tl.layers.GRU(input_size=16, hidden_size=32, bias=True, num_layers=2, bidirectional=True, batch_first=False, dropout=0, name='GRU_1')
>>> y, h = cell(input, prev_h)
>>> print(y.shape)
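A further minimal sketch: the final states of one call can be passed back in as the initial states of the next call, e.g. when processing a long sequence in chunks.
>>> y1, h1 = cell(input)
>>> y2, h2 = cell(input, h1)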
"""
def __init__(
self,
input_size,
hidden_size,
num_layers=1,
bias=True,
batch_first=False,
dropout=0.0,
bidirectional=False,
name=None,
):
super(GRU, self
).__init__('GRU', input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, name)