Source code for speeq.models.encoders

"""This module provides various speech encoders.

The available encoders are:

- DeepSpeechV1Encoder: The encoder implementation of the DeepSpeech V1 model.
- DeepSpeechV2Encoder: The encoder implementation of the DeepSpeech V2 model.
- ConformerEncoder: The encoder implementation of the Conformer model.
- JasperEncoder: The encoder implementation of the Jasper model.
- Wav2LetterEncoder: The encoder implementation of the Wav2Letter model.
- QuartzNetEncoder: The encoder implementation of the QuartzNet model.
- SqueezeformerEncoder: The encoder implementation of the Squeezeformer model.
- SpeechTransformerEncoder: The encoder implementation of the Speech Transformer model.
- RNNEncoder: The encoder implementation of a general RNN model.
- PyramidRNNEncoder: The encoder implementation of the Pyramid RNN model.
- ContextNetEncoder: The encoder implementation of the ContextNet model.
- VGGTransformerEncoder: The encoder implementation of the VGG-Transformer.
- TransformerTransducerEncoder: The encoder implementation of the transformer transducer with relative truncated multi-head self-attention.

Each encoder takes a speech input of shape [B, M, d], and the lengths if
shape [B], where B is the batch size, M is the length of
the speech sequence, and d is the number of features.
"""
from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor, nn

from speeq.utils.utils import add_pos_enc, calc_data_len, get_mask_from_lens

from .activations import CReLu
from .layers import (
    ConformerBlock,
    ConformerPreNet,
    ContextNetBlock,
    Conv1DLayers,
    JasperBlocks,
    JasperSubBlock,
    QuartzBlocks,
    RowConv1D,
    SpeechTransformerEncLayer,
    SqueezeformerBlock,
    TransformerEncLayer,
    TransformerEncLayerWithAttTruncation,
    TransformerTransducerLayer,
    VGGTransformerPreNet,
)


[docs]class DeepSpeechV1Encoder(nn.Module): """Builds the DeepSpeech encoder described in https://arxiv.org/abs/1412.5567 Args: in_features (int): The input feature size. hidden_size (int): The hidden size of the rnn layers. n_linear_layers (int): The number of feed-forward layers. bidirectional (bool): if the rnn is bidirectional or not. max_clip_value (int): The maximum relu clipping value. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. p_dropout (float): The dropout rate. """ def __init__( self, in_features: int, hidden_size: int, n_linear_layers: int, bidirectional: bool, max_clip_value: int, rnn_type: str, p_dropout: float, ) -> None: super().__init__() self.ff_layers = nn.ModuleList( [ nn.Sequential( nn.Linear( in_features=in_features if i == 0 else hidden_size, out_features=hidden_size, ), CReLu(max_val=max_clip_value), nn.Dropout(p=p_dropout), ) for i in range(n_linear_layers) ] ) from .registry import PACKED_RNN_REGISTRY self.rnn = PACKED_RNN_REGISTRY[rnn_type]( input_size=hidden_size, hidden_size=hidden_size, bidirectional=bidirectional ) self.fc = nn.Linear( in_features=hidden_size, out_features=hidden_size, ) self.crelu = CReLu(max_val=max_clip_value) self.bidirectional = bidirectional self.hidden_size = hidden_size
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) for layer in self.ff_layers: x = layer(x) out, _, lengths = self.rnn(x, lengths.cpu()) out = self.crelu(out) if self.bidirectional is True: out = out[..., : self.hidden_size] + out[..., self.hidden_size :] out = self.crelu(self.fc(out)) return out, lengths
[docs]class DeepSpeechV2Encoder(nn.Module): """Implements the deep speech 2 encoder proposed in https://arxiv.org/abs/1512.02595 Args: n_conv (int): The number of convolution layers. kernel_size (int): The kernel size of the convolution layers. stride (int): The stride size of the convolution layer. in_features (int): The input/speech feature size. hidden_size (int): The hidden size of the RNN layers. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. n_rnn (int): The number of RNN layers. n_linear_layers (int): The number of linear layers. max_clip_value (int): The maximum relu clipping value. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. tau (int): The future context size. p_dropout (float): The dropout rate. """ def __init__( self, n_conv: int, kernel_size: int, stride: int, in_features: int, hidden_size: int, bidirectional: bool, n_rnn: int, n_linear_layers: int, max_clip_value: int, rnn_type: str, tau: int, p_dropout: float, ) -> None: super().__init__() self.conv = Conv1DLayers( in_size=in_features, out_size=hidden_size, kernel_size=kernel_size, stride=stride, n_layers=n_conv, p_dropout=p_dropout, activation=CReLu(max_val=max_clip_value), ) from .registry import PACKED_RNN_REGISTRY self.rnns = nn.ModuleList( [ PACKED_RNN_REGISTRY[rnn_type]( input_size=hidden_size, hidden_size=hidden_size, bidirectional=bidirectional, ) for _ in range(n_rnn) ] ) self.rnn_bnorms = nn.ModuleList( [nn.BatchNorm1d(num_features=hidden_size) for _ in range(n_rnn)] ) self.linear_layers = nn.ModuleList( [ nn.Linear(in_features=hidden_size, out_features=hidden_size) for _ in range(n_linear_layers) ] ) self.linear_bnorms = nn.ModuleList( [nn.BatchNorm1d(num_features=hidden_size) for _ in range(n_linear_layers)] ) self.crelu = CReLu(max_val=max_clip_value) self.context_conv = RowConv1D(tau=tau, feat_size=hidden_size) self.hidden_size = hidden_size self.bidirectional = bidirectional
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) lengths = lengths.cpu() out, lengths = self.conv(x, lengths) out = self.crelu(out) for bnorm, layer in zip(self.rnn_bnorms, self.rnns): out = out.transpose(-1, -2) out = bnorm(out) out = out.transpose(-1, -2) out, _, lengths = layer(out, lengths) if self.bidirectional is True: out = out[..., : self.hidden_size] + out[..., self.hidden_size :] out = self.crelu(out) out = self.context_conv(out) for bnorm, layer in zip(self.linear_bnorms, self.linear_layers): out = layer(out) out = out.transpose(-1, -2) out = bnorm(out) out = out.transpose(-1, -2) out = self.crelu(out) return out, lengths
[docs]class ConformerEncoder(nn.Module): """Implements the conformer encoder proposed in https://arxiv.org/abs/2005.08100 Args: d_model (int): The model dimension. n_conf_layers (int): The number of conformer blocks. ff_expansion_factor (int): The feed-forward expansion factor. h (int): The number of attention heads. kernel_size (int): The convolution module kernel size. ss_kernel_size (int): The subsampling layer kernel size. ss_stride (int): The subsampling layer stride size. ss_num_conv_layers (int): The number of subsampling convolutional layers. in_features (int): The input/speech feature size. res_scaling (float): The residual connection multiplier. p_dropout (float): The dropout rate. """ def __init__( self, d_model: int, n_conf_layers: int, ff_expansion_factor: int, h: int, kernel_size: int, ss_kernel_size: int, ss_stride: int, ss_num_conv_layers: int, in_features: int, res_scaling: float, p_dropout: float, ) -> None: super().__init__() self.sub_sampling = ConformerPreNet( in_features=in_features, kernel_size=ss_kernel_size, stride=ss_stride, n_conv_layers=ss_num_conv_layers, d_model=d_model, p_dropout=p_dropout, ) self.blocks = nn.ModuleList( [ ConformerBlock( d_model=d_model, ff_expansion_factor=ff_expansion_factor, h=h, kernel_size=kernel_size, p_dropout=p_dropout, res_scaling=res_scaling, ) for _ in range(n_conf_layers) ] )
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) lengths = lengths.cpu() out, lengths = self.sub_sampling(x, lengths) mask = get_mask_from_lens(lengths, lengths.max().item()) mask = mask.to(x.device) for layer in self.blocks: out = layer(out, mask) return out, lengths
[docs]class JasperEncoder(nn.Module): """Implements Jasper's encoder proposed in https://arxiv.org/abs/1904.03288 Args: in_features (int): The input/speech feature size. num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper). num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper). channel_inc (int): The rate to increase the number of channels across the blocks. epilog_kernel_size (int): The kernel size of the epilog block convolution layer. prelog_kernel_size (int): The kernel size of the prelog block ocnvolution layer. prelog_stride (int): The stride size of the prelog block convolution layer. prelog_n_channels (int): The output channnels of the prelog block convolution layer. blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the convolution layer for each block. p_dropout (float): The dropout rate. """ def __init__( self, in_features: int, num_blocks: int, num_sub_blocks: int, channel_inc: int, epilog_kernel_size: int, prelog_kernel_size: int, prelog_stride: int, prelog_n_channels: int, blocks_kernel_size: Union[int, List[int]], p_dropout: float, ) -> None: super().__init__() self.prelog = JasperSubBlock( in_channels=in_features, out_channels=prelog_n_channels, kernel_size=prelog_kernel_size, p_dropout=p_dropout, padding=0, stride=prelog_stride, ) self.prelog_stride = prelog_stride self.prelog_kernel_size = prelog_kernel_size self.blocks = JasperBlocks( num_blocks=num_blocks, num_sub_blocks=num_sub_blocks, in_channels=prelog_n_channels, channel_inc=channel_inc, kernel_size=blocks_kernel_size, p_dropout=p_dropout, ) self.epilog1 = JasperSubBlock( in_channels=prelog_n_channels + channel_inc * num_blocks, out_channels=prelog_n_channels + channel_inc * (1 + num_blocks), kernel_size=epilog_kernel_size, p_dropout=p_dropout, ) self.epilog2 = JasperSubBlock( in_channels=prelog_n_channels + channel_inc * (1 + num_blocks), out_channels=prelog_n_channels + channel_inc * (2 + num_blocks), kernel_size=1, p_dropout=p_dropout, )
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) lengths = lengths.cpu() x = x.transpose(-1, -2) out = self.prelog(x) lengths = calc_data_len( result_len=out.shape[-1], pad_len=x.shape[-1] - lengths, data_len=lengths, kernel_size=self.prelog_kernel_size, stride=self.prelog_stride, ) out = self.blocks(out) out = self.epilog1(out) out = self.epilog2(out) out = out.transpose(-1, -2) # [B, M, d'] return out, lengths
[docs]class Wav2LetterEncoder(nn.Module): """Implements the Wav2Letter encoder proposed in https://arxiv.org/abs/1609.03193 Args: in_features (int): The input/speech feature size. n_conv_layers (int): The number of convolution layers. layers_kernel_size (int): The kernel size of the convolution layers. layers_channels_size (int): The number of output channels of each convolution layer. pre_conv_stride (int): The stride of the prenet convolution layer. pre_conv_kernel_size (int): The kernel size of the prenet convolution layer. post_conv_channels_size (int): The number of output channels of the postnet convolution layer. post_conv_kernel_size (int): The kernel size of the postnet convolution layer. p_dropout (float): The dropout rate. wav_kernel_size (Optional[int]): The kernel size of the first layer that processes the wav samples directly if wav is modeled. Default None. wav_stride (Optional[int]): The stride size of the first layer that processes the wav samples directly if wav is modeled. Default None. """ def __init__( self, in_features: int, n_conv_layers: int, layers_kernel_size: int, layers_channels_size: int, pre_conv_stride: int, pre_conv_kernel_size: int, post_conv_channels_size: int, post_conv_kernel_size: int, p_dropout: float, wav_kernel_size: Optional[int] = None, wav_stride: Optional[int] = None, ) -> None: super().__init__() self.is_wav = in_features == 1 if self.is_wav: assert wav_kernel_size is not None assert wav_stride is not None self.raw_conv = nn.Conv1d( in_channels=1, out_channels=layers_channels_size, kernel_size=wav_kernel_size, stride=wav_stride, ) self.pre_conv = nn.Conv1d( in_channels=layers_channels_size if self.is_wav else in_features, out_channels=layers_channels_size, kernel_size=pre_conv_kernel_size, stride=pre_conv_stride, ) self.convs = nn.ModuleList( [ nn.Conv1d( in_channels=layers_channels_size, out_channels=layers_channels_size, kernel_size=layers_kernel_size, padding="same", ) for _ in range(n_conv_layers - 1) ] ) self.convs.append( nn.Conv1d( in_channels=layers_channels_size, out_channels=post_conv_channels_size, kernel_size=post_conv_kernel_size, padding="same", ) ) self.post_conv = nn.Conv1d( in_channels=post_conv_channels_size, out_channels=post_conv_channels_size, kernel_size=1, padding="same", ) self.dropout = nn.Dropout(p_dropout)
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) lengths = lengths.cpu() x = x.transpose(-1, -2) out = x if self.is_wav: out = self.raw_conv(out) out = torch.tanh(out) out = self.dropout(out) lengths = calc_data_len( result_len=out.shape[-1], pad_len=x.shape[-1] - lengths, data_len=lengths, kernel_size=self.raw_conv.kernel_size[0], stride=self.raw_conv.stride[0], ) results = self.pre_conv(out) lengths = calc_data_len( result_len=results.shape[-1], pad_len=out.shape[-1] - lengths, data_len=lengths, kernel_size=self.pre_conv.kernel_size[0], stride=self.pre_conv.stride[0], ) out = results out = torch.tanh(out) out = self.dropout(out) for layer in self.convs: out = layer(out) out = torch.tanh(out) out = self.dropout(out) out = self.post_conv(out) out = torch.tanh(out) out = self.dropout(out) out = out.transpose(-1, -2) # [B, M, d] return out, lengths
[docs]class QuartzNetEncoder(JasperEncoder): """Implements QuartzNet encoder proposed in https://arxiv.org/abs/1910.10261 Args: in_features (int): The input/speech feature size. num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper). block_repetition (int): The number of times to repeat each block (denoted as 'S' in the paper). num_sub_blocks (int): The number of QuartzNet subblocks, (denoted as 'R' in the paper). channels_size (List[int]): A list of integers representing the number of output channels for each block. epilog_kernel_size (int): The kernel size of the convolution layer in the epilog block. epilog_channel_size (Tuple[int, int]): A tuple for both epilog layers of the convolution layer . prelog_kernel_size (int): The kernel size pf the convolution layer in the prelog block. prelog_stride (int): The stride size of the of the convoltuional layer in the prelog block. prelog_n_channels (int): The number of output channels of the convolutional layer in the prelog block. groups (int): The groups size. blocks_kernel_size (Union[int, List[int]]): An integer or a list of integers representing the kernel size(s) for each block's convolutional layer. p_dropout (float): The dropout rate. """ def __init__( self, in_features: int, num_blocks: int, block_repetition: int, num_sub_blocks: int, channels_size: List[int], epilog_kernel_size: int, epilog_channel_size: Tuple[int, int], prelog_kernel_size: int, prelog_stride: int, prelog_n_channels: int, groups: int, blocks_kernel_size: Union[int, List[int]], p_dropout: float, ) -> None: super().__init__( in_features=in_features, num_blocks=num_blocks, num_sub_blocks=num_sub_blocks, channel_inc=0, epilog_kernel_size=epilog_kernel_size, prelog_kernel_size=prelog_kernel_size, prelog_stride=prelog_stride, prelog_n_channels=prelog_n_channels, blocks_kernel_size=blocks_kernel_size, p_dropout=p_dropout, ) self.blocks = QuartzBlocks( num_blocks=num_blocks, block_repetition=block_repetition, num_sub_blocks=num_sub_blocks, in_channels=prelog_n_channels, channels_size=channels_size, kernel_size=blocks_kernel_size, groups=groups, p_dropout=p_dropout, ) self.epilog1 = JasperSubBlock( in_channels=channels_size[-1], out_channels=epilog_channel_size[0], kernel_size=epilog_kernel_size, p_dropout=p_dropout, ) self.epilog2 = JasperSubBlock( in_channels=epilog_channel_size[0], out_channels=epilog_channel_size[1], kernel_size=1, p_dropout=p_dropout, )
[docs]class SqueezeformerEncoder(nn.Module): """Implements the Squeezeformer encoder as described in https://arxiv.org/abs/2206.00888 Args: in_features (int): The input/speech feature size. n (int): The number of layers per block, (denoted as N in the paper). d_model (int): The model dimension. ff_expansion_factor (int): The expansion factor of linear layer in the feed forward module. h (int): The number of attention heads. kernel_size (int): The kernel size of the depth-wise convolution layer. pooling_kernel_size (int): The kernel size of the pooling convolution layer. pooling_stride (int): The stride size of the pooling convolution layer. ss_kernel_size (Union[int, List[int]]): The kernel size of the subsampling layer(s). ss_stride (Union[int, List[int]]): The stride of the subsampling layer(s). ss_n_conv_layers (int): The number of subsampling convolutional layers. p_dropout (float): The dropout rate. ss_groups (Union[int, List[int]]): The subsampling convolution groups size(s). masking_value (int): The masking value. Default -1e15 """ def __init__( self, in_features: int, n: int, d_model: int, ff_expansion_factor: int, h: int, kernel_size: int, pooling_kernel_size: int, pooling_stride: int, ss_kernel_size: Union[int, List[int]], ss_stride: Union[int, List[int]], ss_n_conv_layers: int, p_dropout: float, ss_groups: Union[int, List[int]] = 1, masking_value: int = -1e15, ) -> None: super().__init__() self.subsampling = ConformerPreNet( in_features=in_features, kernel_size=ss_kernel_size, stride=ss_stride, n_conv_layers=ss_n_conv_layers, d_model=d_model, p_dropout=p_dropout, groups=ss_groups, ) self.layers1 = nn.ModuleList( [ SqueezeformerBlock( d_model=d_model, ff_expansion_factor=ff_expansion_factor, h=h, kernel_size=kernel_size, p_dropout=p_dropout, masking_value=masking_value, ) for _ in range(n - 1) ] ) self.pooling = nn.Conv1d( in_channels=d_model, out_channels=d_model, kernel_size=pooling_kernel_size, stride=pooling_stride, groups=d_model, ) self.layers2 = nn.ModuleList( [ SqueezeformerBlock( d_model=d_model, ff_expansion_factor=ff_expansion_factor, h=h, kernel_size=kernel_size, p_dropout=p_dropout, masking_value=masking_value, ) for _ in range(n) ] ) self.upsampling_conv = nn.ConvTranspose1d( in_channels=d_model, out_channels=d_model, kernel_size=pooling_kernel_size, stride=pooling_stride, ) self.sf_layer = SqueezeformerBlock( d_model=d_model, ff_expansion_factor=ff_expansion_factor, h=h, kernel_size=kernel_size, p_dropout=p_dropout, masking_value=masking_value, ) def _pass_through_layers( self, x: Tensor, mask: Tensor, layers: nn.ModuleList ) -> Tensor: for layer in layers: x = layer(x, mask) return x def _upsample(self, x: Tensor, target_len: int): # x of shape [B, M, d] x = x.transpose(-1, -2) out = self.upsampling_conv(x) res_len = target_len - out.shape[-1] out = torch.cat([out, torch.zeros(*x.shape[:2], res_len).to(x.device)], dim=-1) out = out.transpose(-1, -2) return out def _time_pooling(self, x: Tensor): x = x.transpose(-1, -2) out = self.pooling(x) out = out.transpose(-1, -2) return out
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) out, lengths = self.subsampling(x, lengths) mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1]) out = self._pass_through_layers(out, mask, self.layers1) result = self._time_pooling(out) pooled_len = calc_data_len( result_len=result.shape[1], pad_len=out.shape[1] - lengths, data_len=lengths, kernel_size=self.pooling.kernel_size[0], stride=self.pooling.stride[0], ) pooled_mask = get_mask_from_lens(lengths=pooled_len, max_len=result.shape[1]) result = self._pass_through_layers(result, pooled_mask, self.layers2) result = self._upsample(result, out.shape[1]) out = result + out out = self.sf_layer(out, mask) return out, lengths
[docs]class SpeechTransformerEncoder(nn.Module): """Implements the speech transformer encoder described in https://ieeexplore.ieee.org/document/8462506 Args: in_features (int): The input/speech feature size. n_conv_layers (int): The number of down-sampling convolutional layers. kernel_size (int): The kernel size of the down-sampling convolutional layers. stride (int): The stride size of the down-sampling convolutional layers. d_model (int): The model dimensionality. n_layers (int): The number of encoder layers. ff_size (int): The dimensionality of the inner layer of the feed-forward module. h (int): The number of attention heads. att_kernel_size (int): The kernel size of the attentional convolutional layers. att_out_channels (int): The number of output channels of the attentional convolution layers. """ def __init__( self, in_features: int, n_conv_layers: int, kernel_size: int, stride: int, d_model: int, n_layers: int, ff_size: int, h: int, att_kernel_size: int, att_out_channels: int, ) -> None: super().__init__() self.conv_layers = nn.ModuleList( [ torch.nn.Conv2d( in_channels=1, out_channels=1, kernel_size=kernel_size, stride=stride, ) for _ in range(n_conv_layers) ] ) self.relu = nn.ReLU() for _ in range(n_conv_layers): in_features = (in_features - kernel_size) // stride + 1 self.fc = nn.Linear(in_features=in_features, out_features=d_model) self.layers = nn.ModuleList( [ SpeechTransformerEncLayer( d_model=d_model, ff_size=ff_size, h=h, out_channels=att_out_channels, kernel_size=att_kernel_size, ) for _ in range(n_layers) ] ) self.layer_norm = nn.LayerNorm(normalized_shape=d_model) self.d_model = d_model def _pre_process(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]: x = x.unsqueeze(dim=1) # B, 1, M, d lengths = mask.sum(dim=-1) for layer in self.conv_layers: length = x.shape[-2] x = layer(x) lengths = calc_data_len( result_len=x.shape[-2], pad_len=length - lengths, data_len=lengths, kernel_size=layer.kernel_size[0], stride=layer.stride[0], ) x = self.relu(x) x = x.squeeze(dim=1) x = self.fc(x) x = add_pos_enc(x) mask = get_mask_from_lens(lengths=lengths, max_len=x.shape[1]) mask = mask.to(x.device) return x, mask
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ out, mask = self._pre_process(x, mask) for layer in self.layers: out = layer(out, mask) lengths = mask.sum(dim=-1) out = self.layer_norm(out) return out, lengths
[docs]class RNNEncoder(nn.Module): """Implements a stack of RNN layers. Args: in_features (int): The input features size. hidden_size (int): The RNN hidden size. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. n_layers (int): The number of RNN layers. p_dropout (float): The dropout rate. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. """ def __init__( self, in_features: int, hidden_size: int, bidirectional: bool, n_layers: int, p_dropout: float, rnn_type: str = "rnn", ) -> None: super().__init__() from .registry import PACKED_RNN_REGISTRY if bidirectional is True: assert hidden_size % 2 == 0 self.rnns = nn.ModuleList( [ PACKED_RNN_REGISTRY[rnn_type]( input_size=in_features if i == 0 else hidden_size, hidden_size=hidden_size // 2 if bidirectional else hidden_size, batch_first=True, enforce_sorted=False, bidirectional=bidirectional, ) for i in range(n_layers) ] ) self.dropout = nn.Dropout(p_dropout) self.n_layers = n_layers
[docs] def forward( self, x: Tensor, mask: Tensor, return_h=False, *args, **kwargs ) -> Tuple[Tensor, Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ out = x lengths = mask.sum(dim=-1).cpu() for i, layer in enumerate(self.rnns): out, h, lengths = layer(out, lengths) if (i + 1) != self.n_layers: out = self.dropout(out) if return_h is True: return out, h, lengths return out, lengths
[docs]class PyramidRNNEncoder(nn.Module): """Implements a pyramid of RNN as described in https://arxiv.org/abs/1508.01211. Args: in_features (int): The input features size. hidden_size (int): The RNN hidden size. reduction_factor (int): The time resolution reduction factor. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. n_layers (int): The number of RNN layers. p_dropout (float): The dropout rate. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. """ def __init__( self, in_features: int, hidden_size: int, reduction_factor: int, bidirectional: bool, n_layers: int, p_dropout: float, rnn_type: str = "rnn", ) -> None: super().__init__() self.reduction_factor = reduction_factor from .registry import PACKED_RNN_REGISTRY if bidirectional is True: assert hidden_size % 2 == 0 if bidirectional is True: hidden_size = hidden_size // 2 self.rnns = nn.ModuleList( [ PACKED_RNN_REGISTRY[rnn_type]( input_size=in_features, hidden_size=hidden_size, batch_first=True, enforce_sorted=False, bidirectional=bidirectional, ) ] ) for _ in range(n_layers - 1): inp_size = (1 + bidirectional) * hidden_size * reduction_factor self.rnns.append( PACKED_RNN_REGISTRY[rnn_type]( input_size=inp_size, hidden_size=hidden_size, batch_first=True, enforce_sorted=False, bidirectional=bidirectional, ) ) self.dropout = nn.Dropout(p_dropout) self.n_layers = n_layers def _reduce(self, x: Tensor) -> Tensor: # x of shape [B, M, d] max_len = x.shape[1] assert max_len > self.reduction_factor # making sure it's divisible by the reduction factor res_len = max_len % self.reduction_factor res_len = self.reduction_factor if res_len == 0 else res_len pad_len = self.reduction_factor - res_len # adding trailing zeros to make the sequence divisible x = torch.cat( [x, torch.zeros(x.shape[0], pad_len, x.shape[-1]).to(x.device)], dim=1 ) x = x.view(x.shape[0], x.shape[1] // self.reduction_factor, -1) return x
[docs] def forward( self, x: Tensor, mask: Tensor, return_h=False, *args, **kwargs ) -> Tuple[Tensor, Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ out = x lengths = mask.sum(dim=-1).cpu() for i, layer in enumerate(self.rnns): out, h, lengths = layer(out, lengths) if (i + 1) != self.n_layers: out = self._reduce(out) lengths = torch.ceil(lengths / self.reduction_factor) out = self.dropout(out) lengths = lengths.long() if return_h is True: return out, h, lengths return out, lengths
[docs]class ContextNetEncoder(nn.Module): """Implements the ContextNet encoder proposed in https://arxiv.org/abs/2005.03191 Args: in_features (int): The input feature size. n_layers (int): The number of ContextNet blocks. n_sub_layers (Union[int, List[int]]): The number of convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. stride (Union[int, List[int]]): The stride of the last convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. out_channels (Union[int, List[int]]): The channels size of the convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. kernel_size (int): The convolutional layers kernel size. reduction_factor (int): The feature reduction size of the Squeeze-and-excitation module. """ def __init__( self, in_features: int, n_layers: int, n_sub_layers: Union[int, List[int]], stride: Union[int, List[int]], out_channels: Union[int, List[int]], kernel_size: int, reduction_factor: int, ) -> None: super().__init__() self.layers = nn.ModuleList([]) for i in range(n_layers): if i == 0: self.layers.append( ContextNetBlock( n_layers=1, in_channels=in_features, out_channels=out_channels[i] if isinstance(out_channels, list) else out_channels, kernel_size=kernel_size, reduction_factor=reduction_factor, add_residual=False, last_layer_stride=1, ) ) elif i == n_layers - 1: self.layers.append( ContextNetBlock( n_layers=1, in_channels=out_channels[i - 1] if isinstance(out_channels, list) else out_channels, out_channels=out_channels[i] if isinstance(out_channels, list) else out_channels, kernel_size=kernel_size, reduction_factor=reduction_factor, add_residual=False, last_layer_stride=1, ) ) else: self.layers.append( ContextNetBlock( n_layers=n_sub_layers[i] if isinstance(n_sub_layers, list) else n_sub_layers, in_channels=out_channels[i - 1] if isinstance(out_channels, list) else out_channels, out_channels=out_channels[i] if isinstance(out_channels, list) else out_channels, kernel_size=kernel_size, reduction_factor=reduction_factor, add_residual=True, last_layer_stride=stride[i] if isinstance(stride, list) else stride, ) )
[docs] def forward( self, x: Tensor, mask: Tensor, *args, **kwargs ) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean input mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) out = x.transpose(-1, -2) # [B, d, M] for layer in self.layers: out, lengths = layer(out, lengths) out = out.transpose(-1, -2) # [B, M, d'] return out, lengths
[docs]class VGGTransformerEncoder(nn.Module): """Implements the VGGTransformer encoder as described in https://arxiv.org/abs/1910.12977 Args: in_features (int): The input feature size. n_layers (int): The number of transformer encoder layers with truncated self attention. n_vgg_blocks (int): The number of VGG blocks to use. n_conv_layers_per_vgg_block (List[int]): A list of integers that specifies the number of convolution layers in each block. kernel_sizes_per_vgg_block (List[List[int]]): A list of lists that contains the kernel size for each layer in each block. The length of the outer list should match `n_vgg_blocks`, and each inner list should be the same length as the corresponding block's number of layers. n_channels_per_vgg_block (List[List[int]]): A list of lists that contains the number of channels for each convolution layer in each block. This argument should also have length equal to `n_vgg_blocks`, and each sublist should have length equal to the number of layers in the corresponding block. vgg_pooling_kernel_size (List[int]): A list of integers that specifies the size of the max pooling layer in each block. The length of this list should be equal to `n_vgg_blocks`. d_model (int): The model dimensionality. ff_size (int): The feed forward inner layer dimensionality. h (int): The number of heads in the attention mechanism. left_size (int): The size of the left window that each time step is allowed to look at. right_size (int): The size of the right window that each time step is allowed to look at. masking_value (float, optional): The value to use for masking padded elements. Defaults to -1e15. """ def __init__( self, in_features: int, n_layers: int, n_vgg_blocks: int, n_conv_layers_per_vgg_block: List[int], kernel_sizes_per_vgg_block: List[List[int]], n_channels_per_vgg_block: List[List[int]], vgg_pooling_kernel_size: List[int], d_model: int, ff_size: int, h: int, left_size: int, right_size: int, masking_value: int = -1e15, ) -> None: super().__init__() self.pre_net = VGGTransformerPreNet( in_features=in_features, n_vgg_blocks=n_vgg_blocks, n_layers_per_block=n_conv_layers_per_vgg_block, kernel_sizes_per_block=kernel_sizes_per_vgg_block, n_channels_per_block=n_channels_per_vgg_block, pooling_kernel_size=vgg_pooling_kernel_size, d_model=d_model, ) self.enc_layers = nn.ModuleList( [ TransformerEncLayerWithAttTruncation( d_model=d_model, ff_size=ff_size, h=h, left_size=left_size, right_size=right_size, masking_value=masking_value, ) for _ in range(n_layers) ] )
[docs] def forward(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) out, lengths = self.pre_net(x, lengths) mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1]) for layer in self.enc_layers: out = layer(out, mask) return out, lengths
[docs]class TransformerTransducerEncoder(nn.Module): """Implements the Transformer-Transducer encoder with relative truncated multi-head self attention as described in https://arxiv.org/abs/2002.02562 Args: in_features (int): The input feature size. n_layers (int): The number of transformer encoder layers with truncated self attention and relative positional encoding. d_model (int): The model dimensionality. ff_size (int): The feed forward inner layer dimensionality. h (int): The number of heads in the attention mechanism. left_size (int): The size of the left window that each time step is allowed to look at. right_size (int): The size of the right window that each time step is allowed to look at. p_dropout (float): The dropout rate. stride (int): The stride of the convolution layer. Default 1. kernel_size (int): The kernel size of the convolution layer. Default 1. masking_value (float, optional): The value to use for masking padded elements. Defaults to -1e15. """ def __init__( self, in_features: int, n_layers: int, d_model: int, ff_size: int, h: int, left_size: int, right_size: int, p_dropout: float, stride: int = 1, kernel_size: int = 1, masking_value: int = -1e15, ) -> None: super().__init__() self.pre_net = nn.Conv1d( in_channels=in_features, out_channels=d_model, kernel_size=kernel_size, stride=stride, ) self.enc_layers = nn.ModuleList( [ TransformerTransducerLayer( d_model=d_model, ff_size=ff_size, h=h, left_size=left_size, right_size=right_size, p_dropout=p_dropout, masking_value=masking_value, ) for _ in range(n_layers) ] )
[docs] def forward(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]: """Passes the input `x` through the encoder layers. Args: x (Tensor): The input speech tensor of shape [B, M, d] mask (Tensor): The input boolean mask of shape [B, M], where it's True if there is no padding. Returns: Tuple[Tensor, Tensor]: A tuple where the first element is the encoded speech of shape [B, M, F] and the second element is the lengths of shape [B]. """ lengths = mask.sum(dim=-1) out = x.transpose(-1, -2) out = self.pre_net(out) out = out.transpose(-1, -2) lengths = calc_data_len( result_len=out.shape[1], pad_len=x.shape[1] - lengths, data_len=lengths, kernel_size=self.pre_net.kernel_size[0], stride=self.pre_net.stride[0], ) mask = get_mask_from_lens(lengths=lengths, max_len=out.shape[1]) for layer in self.enc_layers: out = layer(out, mask) return out, lengths