# Source code for speeq.models.encoders

"""This module provides various speech encoders.

The available encoders are:

- DeepSpeechV1Encoder: The encoder implementation of the DeepSpeech V1 model.
- DeepSpeechV2Encoder: The encoder implementation of the DeepSpeech V2 model.
- ConformerEncoder: The encoder implementation of the Conformer model.
- JasperEncoder: The encoder implementation of the Jasper model.
- Wav2LetterEncoder: The encoder implementation of the Wav2Letter model.
- QuartzNetEncoder: The encoder implementation of the QuartzNet model.
- SqueezeformerEncoder: The encoder implementation of the Squeezeformer model.
- SpeechTransformerEncoder: The encoder implementation of the Speech Transformer model.
- RNNEncoder: The encoder implementation of a general RNN model.
- PyramidRNNEncoder: The encoder implementation of the Pyramid RNN model.
- ContextNetEncoder: The encoder implementation of the ContextNet model.
- VGGTransformerEncoder: The encoder implementation of the VGG-Transformer.
- TransformerTransducerEncoder: The encoder implementation of the transformer transducer with relative truncated multi-head self-attention.

Each encoder takes a speech input of shape [B, M, d] and a boolean mask of
shape [B, M] (True where there is no padding), from which the lengths of
shape [B] are derived, where B is the batch size, M is the length of
the speech sequence, and d is the number of features.
"""
from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor, nn

from speeq.utils.utils import add_pos_enc, calc_data_len, get_mask_from_lens

from .activations import CReLu
from .layers import (
    ConformerBlock,
    ConformerPreNet,
    ContextNetBlock,
    Conv1DLayers,
    JasperBlocks,
    JasperSubBlock,
    QuartzBlocks,
    RowConv1D,
    SpeechTransformerEncLayer,
    SqueezeformerBlock,
    TransformerEncLayer,
    TransformerEncLayerWithAttTruncation,
    TransformerTransducerLayer,
    VGGTransformerPreNet,
)


class DeepSpeechV1Encoder(nn.Module):
    """Builds the DeepSpeech encoder described in
    https://arxiv.org/abs/1412.5567

    The encoder is a stack of feed-forward layers (linear -> clipped ReLU ->
    dropout), followed by one recurrent layer and a final linear layer with
    a clipped ReLU.

    Args:

        in_features (int): The input feature size.

        hidden_size (int): The hidden size of the rnn layers.

        n_linear_layers (int): The number of feed-forward layers.

        bidirectional (bool): if the rnn is bidirectional or not.

        max_clip_value (int): The maximum relu clipping value.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        in_features: int,
        hidden_size: int,
        n_linear_layers: int,
        bidirectional: bool,
        max_clip_value: int,
        rnn_type: str,
        p_dropout: float,
    ) -> None:
        super().__init__()
        # Only the first feed-forward layer sees the raw feature size; every
        # following layer maps hidden_size -> hidden_size.
        self.ff_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(
                        in_features=in_features if i == 0 else hidden_size,
                        out_features=hidden_size,
                    ),
                    CReLu(max_val=max_clip_value),
                    nn.Dropout(p=p_dropout),
                )
                for i in range(n_linear_layers)
            ]
        )
        # Imported at call time rather than at module level.
        # NOTE(review): presumably to avoid a circular import with the
        # registry module - confirm.
        from .registry import PACKED_RNN_REGISTRY

        self.rnn = PACKED_RNN_REGISTRY[rnn_type](
            input_size=hidden_size, hidden_size=hidden_size, bidirectional=bidirectional
        )
        self.fc = nn.Linear(
            in_features=hidden_size,
            out_features=hidden_size,
        )
        self.crelu = CReLu(max_val=max_clip_value)
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        # Number of non-padded frames per sample.
        lengths = mask.sum(dim=-1)
        for layer in self.ff_layers:
            x = layer(x)
        # The recurrent wrapper is handed the lengths on the CPU.
        out, _, lengths = self.rnn(x, lengths.cpu())
        out = self.crelu(out)
        if self.bidirectional:
            # Sum the two halves of the feature axis (one per direction) so
            # the result is hidden_size wide again before the final linear.
            out = out[..., : self.hidden_size] + out[..., self.hidden_size :]
        out = self.crelu(self.fc(out))
        return out, lengths


class DeepSpeechV2Encoder(nn.Module):
    """Implements the deep speech 2 encoder proposed in
    https://arxiv.org/abs/1512.02595

    The input goes through convolution layers, then a stack of recurrent
    layers (each preceded by batch normalization), a row convolution over
    the future context, and finally linear layers (each followed by batch
    normalization and a clipped ReLU).

    Args:
        n_conv (int): The number of convolution layers.

        kernel_size (int): The kernel size of the convolution layers.

        stride (int): The stride size of the convolution layer.

        in_features (int): The input/speech feature size.

        hidden_size (int): The hidden size of the RNN layers.

        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.

        n_rnn (int): The number of RNN layers.

        n_linear_layers (int): The number of linear layers.

        max_clip_value (int): The maximum relu clipping value.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

        tau (int): The future context size.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_conv: int,
        kernel_size: int,
        stride: int,
        in_features: int,
        hidden_size: int,
        bidirectional: bool,
        n_rnn: int,
        n_linear_layers: int,
        max_clip_value: int,
        rnn_type: str,
        tau: int,
        p_dropout: float,
    ) -> None:
        super().__init__()
        self.conv = Conv1DLayers(
            in_size=in_features,
            out_size=hidden_size,
            kernel_size=kernel_size,
            stride=stride,
            n_layers=n_conv,
            p_dropout=p_dropout,
            activation=CReLu(max_val=max_clip_value),
        )
        # Imported at call time rather than at module level.
        # NOTE(review): presumably to avoid a circular import with the
        # registry module - confirm.
        from .registry import PACKED_RNN_REGISTRY

        self.rnns = nn.ModuleList(
            [
                PACKED_RNN_REGISTRY[rnn_type](
                    input_size=hidden_size,
                    hidden_size=hidden_size,
                    bidirectional=bidirectional,
                )
                for _ in range(n_rnn)
            ]
        )
        # One batch norm per recurrent layer, applied to that layer's input.
        self.rnn_bnorms = nn.ModuleList(
            [nn.BatchNorm1d(num_features=hidden_size) for _ in range(n_rnn)]
        )
        self.linear_layers = nn.ModuleList(
            [
                nn.Linear(in_features=hidden_size, out_features=hidden_size)
                for _ in range(n_linear_layers)
            ]
        )
        # One batch norm per linear layer, applied to that layer's output.
        self.linear_bnorms = nn.ModuleList(
            [nn.BatchNorm1d(num_features=hidden_size) for _ in range(n_linear_layers)]
        )
        self.crelu = CReLu(max_val=max_clip_value)
        self.context_conv = RowConv1D(tau=tau, feat_size=hidden_size)
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

    @staticmethod
    def _batch_norm(bnorm: nn.Module, out: Tensor) -> Tensor:
        """Applies `bnorm` over the feature axis of a [B, M, F] tensor."""
        # BatchNorm1d normalizes dim 1, so move the features there and back.
        out = out.transpose(-1, -2)
        out = bnorm(out)
        return out.transpose(-1, -2)

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        # Number of non-padded frames per sample, kept on the CPU.
        lengths = mask.sum(dim=-1).cpu()
        out, lengths = self.conv(x, lengths)
        out = self.crelu(out)
        for bnorm, layer in zip(self.rnn_bnorms, self.rnns):
            out = self._batch_norm(bnorm, out)
            out, _, lengths = layer(out, lengths)
            if self.bidirectional:
                # Sum the two halves of the feature axis (one per direction)
                # so the next layer receives hidden_size features again.
                out = out[..., : self.hidden_size] + out[..., self.hidden_size :]
            out = self.crelu(out)
        out = self.context_conv(out)
        for bnorm, layer in zip(self.linear_bnorms, self.linear_layers):
            out = layer(out)
            out = self._batch_norm(bnorm, out)
            out = self.crelu(out)
        return out, lengths


class ConformerEncoder(nn.Module):
    """Implements the conformer encoder proposed in
    https://arxiv.org/abs/2005.08100

    The input is first passed through a convolutional sub-sampling pre-net
    and then through a stack of conformer blocks.

    Args:
        d_model (int): The model dimension.

        n_conf_layers (int): The number of conformer blocks.

        ff_expansion_factor (int): The feed-forward expansion factor.

        h (int): The number of attention heads.

        kernel_size (int): The convolution module kernel size.

        ss_kernel_size (int): The subsampling layer kernel size.

        ss_stride (int): The subsampling layer stride size.

        ss_num_conv_layers (int): The number of subsampling convolutional layers.

        in_features (int): The input/speech feature size.

        res_scaling (float): The residual connection multiplier.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        d_model: int,
        n_conf_layers: int,
        ff_expansion_factor: int,
        h: int,
        kernel_size: int,
        ss_kernel_size: int,
        ss_stride: int,
        ss_num_conv_layers: int,
        in_features: int,
        res_scaling: float,
        p_dropout: float,
    ) -> None:
        super().__init__()
        self.sub_sampling = ConformerPreNet(
            in_features=in_features,
            kernel_size=ss_kernel_size,
            stride=ss_stride,
            n_conv_layers=ss_num_conv_layers,
            d_model=d_model,
            p_dropout=p_dropout,
        )
        self.blocks = nn.ModuleList(
            [
                ConformerBlock(
                    d_model=d_model,
                    ff_expansion_factor=ff_expansion_factor,
                    h=h,
                    kernel_size=kernel_size,
                    p_dropout=p_dropout,
                    res_scaling=res_scaling,
                )
                for _ in range(n_conf_layers)
            ]
        )

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        # Number of non-padded frames per sample, kept on the CPU.
        lengths = mask.sum(dim=-1).cpu()
        out, lengths = self.sub_sampling(x, lengths)
        # Size the mask from the sub-sampled tensor itself, not from the
        # longest length: when the batch is padded beyond its longest sample
        # the two differ and the mask would no longer line up with `out`.
        mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1])
        mask = mask.to(x.device)
        for layer in self.blocks:
            out = layer(out, mask)
        return out, lengths


class JasperEncoder(nn.Module):
    """Implements Jasper's encoder proposed in https://arxiv.org/abs/1904.03288

    The encoder is made of a strided prelog sub-block, a stack of Jasper
    blocks and two epilog sub-blocks.

    Args:

        in_features (int): The input/speech feature size.

        num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper).

        num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper).

        channel_inc (int): The rate to increase the number of channels across the blocks.

        epilog_kernel_size (int): The kernel size of the epilog block convolution layer.

        prelog_kernel_size (int): The kernel size of the prelog block convolution layer.

        prelog_stride (int): The stride size of the prelog block convolution layer.

        prelog_n_channels (int): The output channels of the prelog block convolution layer.

        blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the convolution layer for each block.

        p_dropout (float): The dropout rate.

    """

    def __init__(
        self,
        in_features: int,
        num_blocks: int,
        num_sub_blocks: int,
        channel_inc: int,
        epilog_kernel_size: int,
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        super().__init__()
        self.prelog = JasperSubBlock(
            in_channels=in_features,
            out_channels=prelog_n_channels,
            kernel_size=prelog_kernel_size,
            p_dropout=p_dropout,
            padding=0,
            stride=prelog_stride,
        )
        # Kept so that forward can recompute the lengths after the prelog.
        self.prelog_stride = prelog_stride
        self.prelog_kernel_size = prelog_kernel_size
        self.blocks = JasperBlocks(
            num_blocks=num_blocks,
            num_sub_blocks=num_sub_blocks,
            in_channels=prelog_n_channels,
            channel_inc=channel_inc,
            kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        # The epilog input width is prelog_n_channels plus channel_inc per
        # block; each epilog sub-block then widens by one more channel_inc.
        blocks_out_channels = prelog_n_channels + channel_inc * num_blocks
        epilog1_out_channels = prelog_n_channels + channel_inc * (1 + num_blocks)
        epilog2_out_channels = prelog_n_channels + channel_inc * (2 + num_blocks)
        self.epilog1 = JasperSubBlock(
            in_channels=blocks_out_channels,
            out_channels=epilog1_out_channels,
            kernel_size=epilog_kernel_size,
            p_dropout=p_dropout,
        )
        self.epilog2 = JasperSubBlock(
            in_channels=epilog1_out_channels,
            out_channels=epilog2_out_channels,
            kernel_size=1,
            p_dropout=p_dropout,
        )

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        # Number of non-padded frames per sample, kept on the CPU.
        lengths = mask.sum(dim=-1).cpu()
        x = x.transpose(-1, -2)  # [B, d, M]
        out = self.prelog(x)
        # Only the prelog changes the sequence length that is tracked here,
        # so the lengths are recomputed once, right after it.
        lengths = calc_data_len(
            result_len=out.shape[-1],
            pad_len=x.shape[-1] - lengths,
            data_len=lengths,
            kernel_size=self.prelog_kernel_size,
            stride=self.prelog_stride,
        )
        out = self.blocks(out)
        out = self.epilog1(out)
        out = self.epilog2(out)
        out = out.transpose(-1, -2)  # [B, M, d']
        return out, lengths


class Wav2LetterEncoder(nn.Module):
    """Implements the Wav2Letter encoder proposed in
    https://arxiv.org/abs/1609.03193

    When `in_features` is 1 the input is treated as raw wav samples and an
    extra strided convolution is applied before the prenet convolution.

    Args:

        in_features (int): The input/speech feature size.

        n_conv_layers (int): The number of convolution layers.

        layers_kernel_size (int): The kernel size of the convolution layers.

        layers_channels_size (int): The number of output channels of each convolution layer.

        pre_conv_stride (int): The stride of the prenet convolution layer.

        pre_conv_kernel_size (int): The kernel size of the prenet convolution layer.

        post_conv_channels_size (int): The number of output channels of the
        postnet convolution layer.

        post_conv_kernel_size (int): The kernel size of the postnet convolution layer.

        p_dropout (float): The dropout rate.

        wav_kernel_size (Optional[int]): The kernel size of the first layer that
        processes the wav samples directly if wav is modeled. Default None.

        wav_stride (Optional[int]): The stride size of the first layer that
        processes the wav samples directly if wav is modeled. Default None.
    """

    def __init__(
        self,
        in_features: int,
        n_conv_layers: int,
        layers_kernel_size: int,
        layers_channels_size: int,
        pre_conv_stride: int,
        pre_conv_kernel_size: int,
        post_conv_channels_size: int,
        post_conv_kernel_size: int,
        p_dropout: float,
        wav_kernel_size: Optional[int] = None,
        wav_stride: Optional[int] = None,
    ) -> None:
        super().__init__()
        # A single input feature means the raw waveform is being modeled.
        self.is_wav = in_features == 1
        if self.is_wav:
            assert (
                wav_kernel_size is not None
            ), "wav_kernel_size is required when in_features == 1"
            assert (
                wav_stride is not None
            ), "wav_stride is required when in_features == 1"
            self.raw_conv = nn.Conv1d(
                in_channels=1,
                out_channels=layers_channels_size,
                kernel_size=wav_kernel_size,
                stride=wav_stride,
            )
        self.pre_conv = nn.Conv1d(
            in_channels=layers_channels_size if self.is_wav else in_features,
            out_channels=layers_channels_size,
            kernel_size=pre_conv_kernel_size,
            stride=pre_conv_stride,
        )
        # n_conv_layers - 1 identical layers plus one last layer that maps
        # to the postnet channel size.
        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels=layers_channels_size,
                    out_channels=layers_channels_size,
                    kernel_size=layers_kernel_size,
                    padding="same",
                )
                for _ in range(n_conv_layers - 1)
            ]
        )
        self.convs.append(
            nn.Conv1d(
                in_channels=layers_channels_size,
                out_channels=post_conv_channels_size,
                kernel_size=post_conv_kernel_size,
                padding="same",
            )
        )
        self.post_conv = nn.Conv1d(
            in_channels=post_conv_channels_size,
            out_channels=post_conv_channels_size,
            kernel_size=1,
            padding="same",
        )
        self.dropout = nn.Dropout(p_dropout)

    def _activate(self, out: Tensor) -> Tensor:
        """Applies tanh followed by dropout, as done after every convolution."""
        return self.dropout(torch.tanh(out))

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        # Number of non-padded frames per sample, kept on the CPU.
        lengths = mask.sum(dim=-1).cpu()
        x = x.transpose(-1, -2)  # [B, d, M]
        out = x
        if self.is_wav:
            out = self._activate(self.raw_conv(out))
            lengths = calc_data_len(
                result_len=out.shape[-1],
                pad_len=x.shape[-1] - lengths,
                data_len=lengths,
                kernel_size=self.raw_conv.kernel_size[0],
                stride=self.raw_conv.stride[0],
            )
        results = self.pre_conv(out)
        # `out` still holds the prenet input here, so its length gives the
        # amount of padding that went into the prenet convolution.
        lengths = calc_data_len(
            result_len=results.shape[-1],
            pad_len=out.shape[-1] - lengths,
            data_len=lengths,
            kernel_size=self.pre_conv.kernel_size[0],
            stride=self.pre_conv.stride[0],
        )
        out = self._activate(results)
        # The remaining convolutions use "same" padding, so the lengths do
        # not change any further.
        for layer in self.convs:
            out = self._activate(layer(out))
        out = self._activate(self.post_conv(out))
        out = out.transpose(-1, -2)  # [B, M, d]
        return out, lengths


class QuartzNetEncoder(JasperEncoder):
    """Implements QuartzNet encoder proposed in https://arxiv.org/abs/1910.10261

    Reuses the prelog block and the forward pass of `JasperEncoder`, and
    replaces its blocks and both epilog sub-blocks with QuartzNet ones.

    Args:

        in_features (int): The input/speech feature size.

        num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper).

        block_repetition (int): The number of times to repeat each block (denoted as 'S' in the paper).

        num_sub_blocks (int): The number of QuartzNet subblocks, (denoted as 'R' in the paper).

        channels_size (List[int]): A list of integers representing the number of output channels
        for each block.

        epilog_kernel_size (int): The kernel size of the convolution layer in the epilog block.

        epilog_channel_size (Tuple[int, int]): The number of output channels of
        the first and the second epilog convolution layers, respectively.

        prelog_kernel_size (int): The kernel size of the convolution layer in the prelog block.

        prelog_stride (int): The stride size of the convolutional layer
        in the prelog block.

        prelog_n_channels (int): The number of output channels of the convolutional
        layer in the prelog block.

        groups (int): The groups size.

        blocks_kernel_size (Union[int, List[int]]): An integer or a list of integers representing the
        kernel size(s) for each block's convolutional layer.

        p_dropout (float): The dropout rate.

    """

    def __init__(
        self,
        in_features: int,
        num_blocks: int,
        block_repetition: int,
        num_sub_blocks: int,
        channels_size: List[int],
        epilog_kernel_size: int,
        epilog_channel_size: Tuple[int, int],
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        groups: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        # The parent builds the prelog block (kept as is) together with its
        # own blocks and epilog sub-blocks, which are overwritten below.
        super().__init__(
            in_features=in_features,
            num_blocks=num_blocks,
            num_sub_blocks=num_sub_blocks,
            channel_inc=0,
            epilog_kernel_size=epilog_kernel_size,
            prelog_kernel_size=prelog_kernel_size,
            prelog_stride=prelog_stride,
            prelog_n_channels=prelog_n_channels,
            blocks_kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        self.blocks = QuartzBlocks(
            num_blocks=num_blocks,
            block_repetition=block_repetition,
            num_sub_blocks=num_sub_blocks,
            in_channels=prelog_n_channels,
            channels_size=channels_size,
            kernel_size=blocks_kernel_size,
            groups=groups,
            p_dropout=p_dropout,
        )
        # The first epilog layer consumes the channels of the last block.
        first_epilog_channels, second_epilog_channels = (
            epilog_channel_size[0],
            epilog_channel_size[1],
        )
        self.epilog1 = JasperSubBlock(
            in_channels=channels_size[-1],
            out_channels=first_epilog_channels,
            kernel_size=epilog_kernel_size,
            p_dropout=p_dropout,
        )
        self.epilog2 = JasperSubBlock(
            in_channels=first_epilog_channels,
            out_channels=second_epilog_channels,
            kernel_size=1,
            p_dropout=p_dropout,
        )


class SqueezeformerEncoder(nn.Module):
    """Implements the Squeezeformer encoder
    as described in https://arxiv.org/abs/2206.00888

    After sub-sampling, the input goes through `n - 1` blocks, is pooled
    over time, goes through `n` more blocks, is up-sampled back to the
    pre-pooling length, added to the pre-pooling activations and passed
    through one last block.

    Args:

        in_features (int): The input/speech feature size.

        n (int): The number of layers per block, (denoted as N in the paper).

        d_model (int): The model dimension.

        ff_expansion_factor (int): The expansion factor of linear layer in the
        feed forward module.

        h (int): The number of attention heads.

        kernel_size (int): The kernel size of the depth-wise convolution layer.

        pooling_kernel_size (int): The kernel size of the pooling convolution layer.

        pooling_stride (int): The stride size of the pooling convolution layer.

        ss_kernel_size (Union[int, List[int]]): The kernel size of the subsampling layer(s).

        ss_stride (Union[int, List[int]]): The stride of the subsampling layer(s).

        ss_n_conv_layers (int): The number of subsampling convolutional layers.

        p_dropout (float): The dropout rate.

        ss_groups (Union[int, List[int]]): The subsampling convolution groups size(s).

        masking_value (float): The masking value. Default -1e15
    """

    def __init__(
        self,
        in_features: int,
        n: int,
        d_model: int,
        ff_expansion_factor: int,
        h: int,
        kernel_size: int,
        pooling_kernel_size: int,
        pooling_stride: int,
        ss_kernel_size: Union[int, List[int]],
        ss_stride: Union[int, List[int]],
        ss_n_conv_layers: int,
        p_dropout: float,
        ss_groups: Union[int, List[int]] = 1,
        masking_value: float = -1e15,
    ) -> None:
        super().__init__()
        self.subsampling = ConformerPreNet(
            in_features=in_features,
            kernel_size=ss_kernel_size,
            stride=ss_stride,
            n_conv_layers=ss_n_conv_layers,
            d_model=d_model,
            p_dropout=p_dropout,
            groups=ss_groups,
        )
        # Blocks applied before the time pooling.
        self.layers1 = nn.ModuleList(
            [
                SqueezeformerBlock(
                    d_model=d_model,
                    ff_expansion_factor=ff_expansion_factor,
                    h=h,
                    kernel_size=kernel_size,
                    p_dropout=p_dropout,
                    masking_value=masking_value,
                )
                for _ in range(n - 1)
            ]
        )
        # groups=d_model makes the pooling convolution depth-wise.
        self.pooling = nn.Conv1d(
            in_channels=d_model,
            out_channels=d_model,
            kernel_size=pooling_kernel_size,
            stride=pooling_stride,
            groups=d_model,
        )
        # Blocks applied on the pooled sequence.
        self.layers2 = nn.ModuleList(
            [
                SqueezeformerBlock(
                    d_model=d_model,
                    ff_expansion_factor=ff_expansion_factor,
                    h=h,
                    kernel_size=kernel_size,
                    p_dropout=p_dropout,
                    masking_value=masking_value,
                )
                for _ in range(n)
            ]
        )
        # Mirrors the pooling convolution (same kernel size and stride).
        self.upsampling_conv = nn.ConvTranspose1d(
            in_channels=d_model,
            out_channels=d_model,
            kernel_size=pooling_kernel_size,
            stride=pooling_stride,
        )
        self.sf_layer = SqueezeformerBlock(
            d_model=d_model,
            ff_expansion_factor=ff_expansion_factor,
            h=h,
            kernel_size=kernel_size,
            p_dropout=p_dropout,
            masking_value=masking_value,
        )

    def _pass_through_layers(
        self, x: Tensor, mask: Tensor, layers: nn.ModuleList
    ) -> Tensor:
        """Feeds `x` and `mask` through every layer of `layers` in order."""
        for layer in layers:
            x = layer(x, mask)
        return x

    def _upsample(self, x: Tensor, target_len: int) -> Tensor:
        """Up-samples `x` of shape [B, M, d] over time and right-pads the
        result with zeros up to `target_len` time steps.
        """
        x = x.transpose(-1, -2)  # [B, d, M]
        out = self.upsampling_conv(x)
        res_len = target_len - out.shape[-1]
        # new_zeros allocates the padding directly with the device and dtype
        # of the up-sampled activations.
        padding = out.new_zeros(*out.shape[:2], res_len)
        out = torch.cat([out, padding], dim=-1)
        out = out.transpose(-1, -2)
        return out

    def _time_pooling(self, x: Tensor) -> Tensor:
        """Applies the pooling convolution over the time axis of a
        [B, M, d] tensor.
        """
        x = x.transpose(-1, -2)  # [B, d, M]
        out = self.pooling(x)
        out = out.transpose(-1, -2)
        return out

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        lengths = mask.sum(dim=-1)
        out, lengths = self.subsampling(x, lengths)
        mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1])
        # Keep the mask on the same device as the activations, as the other
        # encoders in this module do (a no-op when it already is).
        mask = mask.to(out.device)
        out = self._pass_through_layers(out, mask, self.layers1)
        result = self._time_pooling(out)
        # The pooled sequence is shorter, so it needs its own lengths/mask.
        pooled_len = calc_data_len(
            result_len=result.shape[1],
            pad_len=out.shape[1] - lengths,
            data_len=lengths,
            kernel_size=self.pooling.kernel_size[0],
            stride=self.pooling.stride[0],
        )
        pooled_mask = get_mask_from_lens(lengths=pooled_len, max_len=result.shape[1])
        pooled_mask = pooled_mask.to(result.device)
        result = self._pass_through_layers(result, pooled_mask, self.layers2)
        # Bring the pooled branch back to the pre-pooling length so it can
        # be added to the pre-pooling activations.
        result = self._upsample(result, out.shape[1])
        out = result + out
        out = self.sf_layer(out, mask)
        return out, lengths


class SpeechTransformerEncoder(nn.Module):
    """Implements the speech transformer encoder
    described in https://ieeexplore.ieee.org/document/8462506

    The input is down-sampled with 2D convolutions, projected to `d_model`,
    given positional encodings, passed through the encoder layers and
    finally layer-normalized.

    Args:

        in_features (int): The input/speech feature size.

        n_conv_layers (int): The number of down-sampling convolutional layers.

        kernel_size (int): The kernel size of the down-sampling convolutional layers.

        stride (int): The stride size of the down-sampling convolutional layers.

        d_model (int): The model dimensionality.

        n_layers (int): The number of encoder layers.

        ff_size (int):  The dimensionality of the inner layer of the feed-forward module.

        h (int): The number of attention heads.

        att_kernel_size (int): The kernel size of the attentional convolutional layers.

        att_out_channels (int): The number of output channels of the attentional convolution layers.
    """

    def __init__(
        self,
        in_features: int,
        n_conv_layers: int,
        kernel_size: int,
        stride: int,
        d_model: int,
        n_layers: int,
        ff_size: int,
        h: int,
        att_kernel_size: int,
        att_out_channels: int,
    ) -> None:
        super().__init__()
        self.conv_layers = nn.ModuleList(
            [
                torch.nn.Conv2d(
                    in_channels=1,
                    out_channels=1,
                    kernel_size=kernel_size,
                    stride=stride,
                )
                for _ in range(n_conv_layers)
            ]
        )
        self.relu = nn.ReLU()
        # The 2D convolutions also shrink the feature axis; track the
        # feature size left after all of them to size the projection layer.
        reduced_features = in_features
        for _ in range(n_conv_layers):
            reduced_features = (reduced_features - kernel_size) // stride + 1
        self.fc = nn.Linear(in_features=reduced_features, out_features=d_model)
        self.layers = nn.ModuleList(
            [
                SpeechTransformerEncLayer(
                    d_model=d_model,
                    ff_size=ff_size,
                    h=h,
                    out_channels=att_out_channels,
                    kernel_size=att_kernel_size,
                )
                for _ in range(n_layers)
            ]
        )
        self.layer_norm = nn.LayerNorm(normalized_shape=d_model)
        self.d_model = d_model

    def _pre_process(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Down-samples and projects `x`, adds the positional encoding and
        rebuilds the mask for the down-sampled sequence.
        """
        x = x.unsqueeze(dim=1)  # B, 1, M, d
        lengths = mask.sum(dim=-1)
        for layer in self.conv_layers:
            # Time length before this convolution, needed for the padding.
            length = x.shape[-2]
            x = layer(x)
            lengths = calc_data_len(
                result_len=x.shape[-2],
                pad_len=length - lengths,
                data_len=lengths,
                kernel_size=layer.kernel_size[0],
                stride=layer.stride[0],
            )
            x = self.relu(x)
        x = x.squeeze(dim=1)
        x = self.fc(x)
        x = add_pos_enc(x)
        mask = get_mask_from_lens(lengths=lengths, max_len=x.shape[1])
        mask = mask.to(x.device)
        return x, mask

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        out, mask = self._pre_process(x, mask)
        for layer in self.layers:
            out = layer(out, mask)
        # The lengths are recovered from the down-sampled mask.
        lengths = mask.sum(dim=-1)
        out = self.layer_norm(out)
        return out, lengths


class RNNEncoder(nn.Module):
    """Implements a stack of RNN layers.

    Args:

        in_features (int): The input features size.

        hidden_size (int): The RNN hidden size. When `bidirectional` is set,
        every direction gets `hidden_size // 2` units, so it has to be even.

        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.

        n_layers (int): The number of RNN layers.

        p_dropout (float): The dropout rate.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

    """

    def __init__(
        self,
        in_features: int,
        hidden_size: int,
        bidirectional: bool,
        n_layers: int,
        p_dropout: float,
        rnn_type: str = "rnn",
    ) -> None:
        super().__init__()
        from .registry import PACKED_RNN_REGISTRY

        if bidirectional is True:
            assert hidden_size % 2 == 0
        rnn_cls = PACKED_RNN_REGISTRY[rnn_type]
        # Only the first layer consumes the raw features; every later layer
        # is fed `hidden_size` features.
        self.rnns = nn.ModuleList(
            [
                rnn_cls(
                    input_size=in_features if i == 0 else hidden_size,
                    hidden_size=hidden_size // 2 if bidirectional else hidden_size,
                    batch_first=True,
                    enforce_sorted=False,
                    bidirectional=bidirectional,
                )
                for i in range(n_layers)
            ]
        )
        self.dropout = nn.Dropout(p_dropout)
        self.n_layers = n_layers

    def forward(
        self, x: Tensor, mask: Tensor, return_h: bool = False, *args, **kwargs
    ) -> Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

            return_h (bool): If True, the hidden state returned by the last
            RNN layer is included in the result. Default False.

        Returns:

            Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]: The
            tuple `(out, lengths)`, or `(out, h, lengths)` when `return_h` is
            True, where `out` is the encoded speech, `lengths` is of shape [B]
            and `h` is whatever the last layer returned as its hidden state
            (None if the encoder has no layers).
        """
        out = x
        # Defined up-front so `return_h=True` cannot hit an unbound name when
        # the layer stack is empty.
        h = None
        # The lengths are moved to the CPU before being handed to the layers.
        lengths = mask.sum(dim=-1).cpu()
        for i, layer in enumerate(self.rnns):
            out, h, lengths = layer(out, lengths)
            # Dropout is applied between layers only, not after the last one.
            if (i + 1) != self.n_layers:
                out = self.dropout(out)
        if return_h is True:
            return out, h, lengths
        return out, lengths


class PyramidRNNEncoder(nn.Module):
    """Implements a pyramid of RNN as described in
    https://arxiv.org/abs/1508.01211.

    Args:

        in_features (int): The input features size.

        hidden_size (int): The RNN hidden size. When `bidirectional` is set,
        every direction gets `hidden_size // 2` units, so it has to be even.

        reduction_factor (int): The time resolution reduction factor.

        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.

        n_layers (int): The number of RNN layers.

        p_dropout (float): The dropout rate.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

    """

    def __init__(
        self,
        in_features: int,
        hidden_size: int,
        reduction_factor: int,
        bidirectional: bool,
        n_layers: int,
        p_dropout: float,
        rnn_type: str = "rnn",
    ) -> None:
        super().__init__()
        from .registry import PACKED_RNN_REGISTRY

        self.reduction_factor = reduction_factor
        if bidirectional is True:
            assert hidden_size % 2 == 0
            hidden_size = hidden_size // 2
        rnn_cls = PACKED_RNN_REGISTRY[rnn_type]
        # Every layer after the first one consumes `reduction_factor`
        # consecutive output frames of the previous layer stacked along the
        # feature axis.
        stacked_size = (1 + bidirectional) * hidden_size * reduction_factor
        input_sizes = [in_features] + [stacked_size] * (n_layers - 1)
        self.rnns = nn.ModuleList(
            [
                rnn_cls(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    batch_first=True,
                    enforce_sorted=False,
                    bidirectional=bidirectional,
                )
                for input_size in input_sizes
            ]
        )
        self.dropout = nn.Dropout(p_dropout)
        self.n_layers = n_layers

    def _reduce(self, x: Tensor) -> Tensor:
        """Stacks every `reduction_factor` consecutive frames into one frame.

        Args:

            x (Tensor): The tensor of shape [B, M, d] to be reduced.

        Returns:

            Tensor: The reduced tensor of shape
            [B, ceil(M / reduction_factor), d * reduction_factor], where the
            time axis is zero padded at the end if M is not divisible by the
            reduction factor.
        """
        max_len = x.shape[1]
        assert max_len > self.reduction_factor
        # Number of trailing frames needed to make the length divisible.
        pad_len = -max_len % self.reduction_factor
        if pad_len > 0:
            # `new_zeros` keeps the dtype and device of `x`; zeros of the
            # default dtype would make `torch.cat` promote e.g. half
            # precision features to float32.
            padding = x.new_zeros(x.shape[0], pad_len, x.shape[-1])
            x = torch.cat([x, padding], dim=1)
        return x.reshape(x.shape[0], x.shape[1] // self.reduction_factor, -1)

    def forward(
        self, x: Tensor, mask: Tensor, return_h: bool = False, *args, **kwargs
    ) -> Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

            return_h (bool): If True, the hidden state returned by the last
            RNN layer is included in the result. Default False.

        Returns:

            Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]: The
            tuple `(out, lengths)`, or `(out, h, lengths)` when `return_h` is
            True, where `out` is the encoded speech with a reduced time axis,
            `lengths` is an integer tensor of shape [B] and `h` is whatever
            the last layer returned as its hidden state.
        """
        out = x
        h = None
        # The lengths are moved to the CPU before being handed to the layers.
        lengths = mask.sum(dim=-1).cpu()
        for i, layer in enumerate(self.rnns):
            out, h, lengths = layer(out, lengths)
            # The time axis is reduced between layers only.
            if (i + 1) != self.n_layers:
                out = self._reduce(out)
                # Cast back to integers right away so the next layer never
                # receives floating point lengths.
                lengths = torch.ceil(lengths / self.reduction_factor).long()
                out = self.dropout(out)
        lengths = lengths.long()
        if return_h is True:
            return out, h, lengths
        return out, lengths


class ContextNetEncoder(nn.Module):
    """Implements the ContextNet encoder proposed in
    https://arxiv.org/abs/2005.03191

    The first and the last blocks are always built with a single
    convolutional layer, a stride of 1 and no residual connection; the
    `n_sub_layers` and `stride` arguments only affect the blocks in between.

    Args:
        in_features (int): The input feature size.

        n_layers (int): The number of ContextNet blocks.

        n_sub_layers (Union[int, List[int]]): The number of convolutional
        layers per block. If list is passed, it has to be of length equal to `n_layers`.

        stride (Union[int, List[int]]): The stride of the last convolutional
        layers per block. If list is passed, it has to be of length equal to
        `n_layers`.

        out_channels (Union[int, List[int]]): The channels size of the
        convolutional layers per block. If list is passed, it has to be of
        length equal to `n_layers`.

        kernel_size (int): The convolutional layers kernel size.

        reduction_factor (int): The feature reduction size of the Squeeze-and-excitation module.
    """

    def __init__(
        self,
        in_features: int,
        n_layers: int,
        n_sub_layers: Union[int, List[int]],
        stride: Union[int, List[int]],
        out_channels: Union[int, List[int]],
        kernel_size: int,
        reduction_factor: int,
    ) -> None:
        super().__init__()
        # Normalise the "scalar or per-block sequence" arguments once instead
        # of re-checking their type for every block.
        channels = self._per_layer(out_channels, n_layers)
        sub_layers = self._per_layer(n_sub_layers, n_layers)
        strides = self._per_layer(stride, n_layers)
        blocks = []
        for i in range(n_layers):
            is_edge = i == 0 or i == n_layers - 1
            blocks.append(
                ContextNetBlock(
                    n_layers=1 if is_edge else sub_layers[i],
                    in_channels=in_features if i == 0 else channels[i - 1],
                    out_channels=channels[i],
                    kernel_size=kernel_size,
                    reduction_factor=reduction_factor,
                    add_residual=not is_edge,
                    last_layer_stride=1 if is_edge else strides[i],
                )
            )
        self.layers = nn.ModuleList(blocks)

    @staticmethod
    def _per_layer(value: Union[int, List[int]], n_layers: int) -> List[int]:
        """Expands `value` into a list holding one entry per block.

        A list or a tuple is returned as a list unchanged in content, any
        other value is repeated `n_layers` times.
        """
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value] * n_layers

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean input mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape
            [B, M, F] and the second element is the lengths of shape [B].
        """
        lengths = mask.sum(dim=-1)
        # The blocks operate channels-first, so the feature and the time
        # axes are swapped on the way in and swapped back on the way out.
        out = x.transpose(-1, -2)  # [B, d, M]
        for layer in self.layers:
            out, lengths = layer(out, lengths)
        out = out.transpose(-1, -2)  # [B, M, d']
        return out, lengths


class VGGTransformerEncoder(nn.Module):
    """Implements the VGGTransformer encoder as described in
    https://arxiv.org/abs/1910.12977

    Args:

        in_features (int): The input feature size.

        n_layers (int): The number of transformer encoder layers with truncated
        self attention.

        n_vgg_blocks (int): The number of VGG blocks to use.

        n_conv_layers_per_vgg_block (List[int]): A list of integers that specifies the number
        of convolution layers in each block.

        kernel_sizes_per_vgg_block (List[List[int]]): A list of lists that contains the
        kernel size for each layer in each block. The length of the outer list
        should match `n_vgg_blocks`, and each inner list should be the same length
        as the corresponding block's number of layers.

        n_channels_per_vgg_block (List[List[int]]): A list of lists that contains the
        number of channels for each convolution layer in each block. This argument
        should also have length equal to `n_vgg_blocks`, and each sublist should
        have length equal to the number of layers in the corresponding block.

        vgg_pooling_kernel_size (List[int]): A list of integers that specifies the size
        of the max pooling layer in each block. The length of this list should be
        equal to `n_vgg_blocks`.

        d_model (int): The model dimensionality.

        ff_size (int): The feed forward inner layer dimensionality.

        h (int): The number of heads in the attention mechanism.

        left_size (int): The size of the left window that each time step is
        allowed to look at.

        right_size (int): The size of the right window that each time step is
        allowed to look at.

        masking_value (float, optional): The value to use for masking padded
        elements. Defaults to -1e15.
    """

    def __init__(
        self,
        in_features: int,
        n_layers: int,
        n_vgg_blocks: int,
        n_conv_layers_per_vgg_block: List[int],
        kernel_sizes_per_vgg_block: List[List[int]],
        n_channels_per_vgg_block: List[List[int]],
        vgg_pooling_kernel_size: List[int],
        d_model: int,
        ff_size: int,
        h: int,
        left_size: int,
        right_size: int,
        masking_value: float = -1e15,
    ) -> None:
        super().__init__()
        self.pre_net = VGGTransformerPreNet(
            in_features=in_features,
            n_vgg_blocks=n_vgg_blocks,
            n_layers_per_block=n_conv_layers_per_vgg_block,
            kernel_sizes_per_block=kernel_sizes_per_vgg_block,
            n_channels_per_block=n_channels_per_vgg_block,
            pooling_kernel_size=vgg_pooling_kernel_size,
            d_model=d_model,
        )
        self.enc_layers = nn.ModuleList(
            [
                TransformerEncLayerWithAttTruncation(
                    d_model=d_model,
                    ff_size=ff_size,
                    h=h,
                    left_size=left_size,
                    right_size=right_size,
                    masking_value=masking_value,
                )
                for _ in range(n_layers)
            ]
        )

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Extra positional and keyword arguments are accepted and ignored.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the
            encoded speech and the second element is the lengths of shape [B],
            both as produced after the VGG pre-net.
        """
        lengths = mask.sum(dim=-1)
        out, lengths = self.pre_net(x, lengths)
        # The input mask no longer matches the pre-net output, so a new one
        # is built from the updated lengths and kept on the features' device.
        mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1])
        mask = mask.to(out.device)
        for layer in self.enc_layers:
            out = layer(out, mask)
        return out, lengths


class TransformerTransducerEncoder(nn.Module):
    """Implements the Transformer-Transducer encoder with relative truncated
    multi-head self attention as described in https://arxiv.org/abs/2002.02562

    Args:

        in_features (int): The input feature size.

        n_layers (int): The number of transformer encoder layers with truncated
        self attention and relative positional encoding.

        d_model (int): The model dimensionality.

        ff_size (int): The feed forward inner layer dimensionality.

        h (int): The number of heads in the attention mechanism.

        left_size (int): The size of the left window that each time step is
        allowed to look at.

        right_size (int): The size of the right window that each time step is
        allowed to look at.

        p_dropout (float): The dropout rate.

        stride (int): The stride of the convolution layer. Default 1.

        kernel_size (int): The kernel size of the convolution layer. Default 1.

        masking_value (float, optional): The value to use for masking padded
        elements. Defaults to -1e15.
    """

    def __init__(
        self,
        in_features: int,
        n_layers: int,
        d_model: int,
        ff_size: int,
        h: int,
        left_size: int,
        right_size: int,
        p_dropout: float,
        stride: int = 1,
        kernel_size: int = 1,
        masking_value: float = -1e15,
    ) -> None:
        super().__init__()
        # A 1D convolution over the time axis maps the input features to the
        # model dimensionality.
        self.pre_net = nn.Conv1d(
            in_channels=in_features,
            out_channels=d_model,
            kernel_size=kernel_size,
            stride=stride,
        )
        self.enc_layers = nn.ModuleList(
            [
                TransformerTransducerLayer(
                    d_model=d_model,
                    ff_size=ff_size,
                    h=h,
                    left_size=left_size,
                    right_size=right_size,
                    p_dropout=p_dropout,
                    masking_value=masking_value,
                )
                for _ in range(n_layers)
            ]
        )

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the input `x` through the encoder layers.

        Extra positional and keyword arguments are accepted and ignored.

        Args:

            x (Tensor): The input speech tensor of shape [B, M, d]

            mask (Tensor): The input boolean mask of shape [B, M], where it's True
            if there is no padding.

        Returns:

            Tuple[Tensor, Tensor]: A tuple where the first element is the
            encoded speech and the second element is the lengths of shape [B],
            both as produced after the convolutional pre-net.
        """
        lengths = mask.sum(dim=-1)
        # Conv1d expects channels-first input, hence the transposes around it.
        out = x.transpose(-1, -2)
        out = self.pre_net(out)
        out = out.transpose(-1, -2)
        # Update the valid lengths to account for the convolution.
        lengths = calc_data_len(
            result_len=out.shape[1],
            pad_len=x.shape[1] - lengths,
            data_len=lengths,
            kernel_size=self.pre_net.kernel_size[0],
            stride=self.pre_net.stride[0],
        )
        # Rebuild the mask for the new time axis and keep it on the
        # features' device.
        mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1])
        mask = mask.to(out.device)
        for layer in self.enc_layers:
            out = layer(out, mask)
        return out, lengths