# Source code for speeq.models.ctc

"""This module contains various CTC (Connectionist Temporal Classification) models for speech recognition. The CTC models are implemented as subclasses of the base class CTCModel.

Classes:

- CTCModel(nn.Module): Base class for CTC models.
- DeepSpeechV1(CTCModel): DeepSpeech version 1 model.
- BERT(nn.Module): Bidirectional Encoder Representations from Transformers (BERT) model.
- DeepSpeechV2(CTCModel): DeepSpeech version 2 model.
- Conformer(CTCModel): Conformer model.
- Jasper(CTCModel): Jasper model.
- Wav2Letter(CTCModel): Wav2Letter model.
- QuartzNet(CTCModel): QuartzNet model.
- Squeezeformer(CTCModel): Squeezeformer model.
"""
from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor, nn

from .encoders import (
    ConformerEncoder,
    DeepSpeechV1Encoder,
    DeepSpeechV2Encoder,
    JasperEncoder,
    QuartzNetEncoder,
    SqueezeformerEncoder,
    Wav2LetterEncoder,
)
from .layers import ConvPredModule, PredModule, TransformerEncLayer


class CTCModel(nn.Module):
    """Abstract base of the CTC models: an encoder followed by a prediction head.

    The base class only builds the prediction network (``pred_net``).
    Subclasses have to assign ``self.encoder`` themselves, otherwise
    ``forward`` will raise an error.

    Args:
        pred_in_size (int): The input feature size of the prediction network.

        n_classes (int): The number of classes to predict.
    """

    def __new__(cls, *args, **kwargs):
        # The base class has no encoder, so only subclasses may be instantiated.
        if cls is CTCModel:
            raise NotImplementedError(f"Cannot create object of type `{cls.__name__}`")
        return super().__new__(cls)

    def __init__(self, pred_in_size: int, n_classes: int) -> None:
        super().__init__()
        self.pred_net = PredModule(
            in_features=pred_in_size,
            n_classes=n_classes,
            activation=nn.LogSoftmax(dim=-1),
        )

    def forward(
        self, x: Tensor, mask: Tensor, *args, **kwargs
    ) -> Tuple[Tensor, Tensor]:
        """Passes the speech input through the encoder and the prediction head.

        Args:

            x (Tensor): The input speech signal of shape [B, M, d]

            mask (Tensor): The speech mask of shape [B, M], where it's false
            for the positions that contain padding.

            *args, **kwargs: Forwarded unchanged to the encoder.

        Returns:
            Tuple[Tensor, Tensor]: A tuple where the first item is the predictions
            of shape [M, B, C], and the second is the lengths tensor of shape [B].
        """
        out, lengths = self.encoder(x, mask, *args, **kwargs)  # B, M, d
        preds = self.pred_net(out)  # B, M, C
        # Move the time axis first: B, M, C -> M, B, C
        preds = preds.permute(1, 0, 2)
        return preds, lengths


class DeepSpeechV1(CTCModel):
    """Builds the DeepSpeech model described in
    https://arxiv.org/abs/1412.5567

    Args:
        in_features (int): The input feature size.

        hidden_size (int): The hidden size of the rnn layers.

        n_linear_layers (int): The number of feed-forward layers.

        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.

        n_classes (int): The number of classes to predict.

        max_clip_value (int): The maximum relu clipping value.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

        p_dropout (float): The dropout rate.

    """

    def __init__(
        self,
        in_features: int,
        hidden_size: int,
        n_linear_layers: int,
        bidirectional: bool,
        n_classes: int,
        max_clip_value: int,
        rnn_type: str,
        p_dropout: float,
    ) -> None:
        super().__init__(pred_in_size=hidden_size, n_classes=n_classes)
        self.encoder = DeepSpeechV1Encoder(
            in_features=in_features,
            hidden_size=hidden_size,
            n_linear_layers=n_linear_layers,
            bidirectional=bidirectional,
            max_clip_value=max_clip_value,
            rnn_type=rnn_type,
            p_dropout=p_dropout,
        )

    @torch.no_grad()
    def predict(self, x: Tensor) -> Tensor:
        """Runs the model on a single, unpadded utterance without gradients.

        Args:
            x (Tensor): The input speech signal of shape [1, T, F].

        Returns:
            Tensor: The predictions returned by the forward pass.
        """
        # A single utterance has no padding, so every position is kept.
        # The mask is created on the input's device to avoid a device
        # mismatch when the model and input do not live on the CPU.
        mask = torch.ones(1, x.shape[1], dtype=torch.long, device=x.device)
        preds, _ = self(x, mask)
        return preds


class BERT(nn.Module):
    """Implements the BERT Model as
    described in https://arxiv.org/abs/1810.04805

    Args:

        max_len (int): The maximum length for positional encoding.

        in_features (int): The input/speech feature size.

        d_model (int): The model dimensionality.

        h (int): The number of attention heads.

        ff_size (int): The inner size of the feed forward module.

        n_layers (int): The number of transformer encoders.

        n_classes (int): The number of classes.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        max_len: int,
        in_features: int,
        d_model: int,
        h: int,
        ff_size: int,
        n_layers: int,
        n_classes: int,
        p_dropout: float,
    ) -> None:
        super().__init__()
        self.fc = nn.Linear(
            in_features=in_features,
            out_features=d_model,
        )
        # Learned positional embedding table, one row per position.
        self.pos_emb = nn.Parameter(torch.randn(max_len, d_model))
        self.layers = nn.ModuleList(
            [
                TransformerEncLayer(d_model=d_model, ff_size=ff_size, h=h)
                for _ in range(n_layers)
            ]
        )
        self.pred_module = PredModule(
            in_features=d_model, n_classes=n_classes, activation=nn.LogSoftmax(dim=-1)
        )
        self.dropout = nn.Dropout(p_dropout)

    def embed(self, x: Tensor, mask: Tensor) -> Tensor:
        """Adds the positional embeddings to the non-padded positions of x.

        Args:
            x (Tensor): The projected input of shape [B, M, d].

            mask (Tensor): The mask of shape [B, M], zero/False at the
            padded positions.

        Returns:
            Tensor: x plus the masked positional embeddings, of shape [B, M, d].
        """
        # Slice by the padded length of x rather than by the longest unpadded
        # length in the batch, so batches that are padded beyond their longest
        # sequence still broadcast against the [B, M] mask.
        emb = self.pos_emb[: x.shape[1]].unsqueeze(dim=0)  # 1, M, d
        # Broadcasting expands the embeddings over the batch and zeroes
        # them at the padded positions.
        emb = mask.unsqueeze(dim=-1) * emb  # B, M, d
        return emb + x

    def forward(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Passes the input through the model.

        Args:
            x (Tensor): The input of shape [B, T, F].

            mask (Tensor): The mask of shape [B, M], True where there is
            no padding.

        Returns:
            Tuple[Tensor, Tensor]: The predictions with the first two
            dimensions of the prediction module output swapped, and the
            lengths computed as the sum of the mask over the last dimension.
        """
        lengths = mask.sum(dim=-1)
        out = self.fc(x)
        out = self.embed(out, mask)
        for layer in self.layers:
            out = layer(out, mask)
            out = self.dropout(out)
        preds = self.pred_module(out)
        preds = preds.permute(1, 0, 2)
        return preds, lengths


class DeepSpeechV2(CTCModel):
    """Implements the deep speech 2 model
    proposed in https://arxiv.org/abs/1512.02595

    Args:
        n_conv (int): The number of convolution layers.

        kernel_size (int): The kernel size of the convolution layers.

        stride (int): The stride size of the convolution layer.

        in_features (int): The input/speech feature size.

        hidden_size (int): The hidden size of the RNN layers.

        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.

        n_rnn (int): The number of RNN layers.

        n_linear_layers (int): The number of linear layers.

        n_classes (int): The number of classes.

        max_clip_value (int): The maximum relu clipping value.

        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.

        tau (int): The future context size.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_conv: int,
        kernel_size: int,
        stride: int,
        in_features: int,
        hidden_size: int,
        bidirectional: bool,
        n_rnn: int,
        n_linear_layers: int,
        n_classes: int,
        max_clip_value: int,
        rnn_type: str,
        tau: int,
        p_dropout: float,
    ) -> None:
        # The prediction head built by the base class consumes features of
        # size hidden_size.
        super().__init__(pred_in_size=hidden_size, n_classes=n_classes)
        self.encoder = DeepSpeechV2Encoder(
            n_conv=n_conv,
            kernel_size=kernel_size,
            stride=stride,
            in_features=in_features,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            n_rnn=n_rnn,
            n_linear_layers=n_linear_layers,
            max_clip_value=max_clip_value,
            rnn_type=rnn_type,
            tau=tau,
            p_dropout=p_dropout,
        )


class Conformer(CTCModel):
    """Implements the conformer model proposed in
    https://arxiv.org/abs/2005.08100. This model is used
    with CTC, while the paper used RNN-T.

    Args:

        n_classes (int): The number of classes.

        d_model (int): The model dimension.

        n_conf_layers (int): The number of conformer blocks.

        ff_expansion_factor (int): The feed-forward expansion factor.

        h (int): The number of attention heads.

        kernel_size (int): The convolution module kernel size.

        ss_kernel_size (int): The subsampling layer kernel size.

        ss_stride (int): The subsampling layer stride size.

        ss_num_conv_layers (int): The number of subsampling convolutional layers.

        in_features (int): The input/speech feature size.

        res_scaling (float): The residual connection multiplier.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_classes: int,
        d_model: int,
        n_conf_layers: int,
        ff_expansion_factor: int,
        h: int,
        kernel_size: int,
        ss_kernel_size: int,
        ss_stride: int,
        ss_num_conv_layers: int,
        in_features: int,
        res_scaling: float,
        p_dropout: float,
    ) -> None:
        # The prediction head built by the base class consumes features of
        # size d_model.
        super().__init__(pred_in_size=d_model, n_classes=n_classes)
        self.encoder = ConformerEncoder(
            d_model=d_model,
            n_conf_layers=n_conf_layers,
            ff_expansion_factor=ff_expansion_factor,
            h=h,
            kernel_size=kernel_size,
            ss_kernel_size=ss_kernel_size,
            ss_stride=ss_stride,
            ss_num_conv_layers=ss_num_conv_layers,
            in_features=in_features,
            res_scaling=res_scaling,
            p_dropout=p_dropout,
        )


class Jasper(CTCModel):
    """Implements Jasper model architecture proposed
    in https://arxiv.org/abs/1904.03288

    Args:

        n_classes (int): The number of classes.

        in_features (int): The input/speech feature size.

        num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper).

        num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper).

        channel_inc (int): The rate to increase the number of channels across the blocks.

        epilog_kernel_size (int): The kernel size of the epilog block convolution layer.

        prelog_kernel_size (int): The kernel size of the prelog block convolution layer.

        prelog_stride (int): The stride size of the prelog block convolution layer.

        prelog_n_channels (int): The output channels of the prelog block convolution layer.

        blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the convolution layer for each block.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_classes: int,
        in_features: int,
        num_blocks: int,
        num_sub_blocks: int,
        channel_inc: int,
        epilog_kernel_size: int,
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        # The sizes passed to the base class are placeholders: the prediction
        # head it builds is replaced by a convolutional one below.
        super().__init__(1, 1)
        # TODO: Add activation function options
        # TODO: Add normalization options
        # TODO: Add residual connections options
        # TODO: passing dropout list
        self.encoder = JasperEncoder(
            in_features=in_features,
            num_blocks=num_blocks,
            num_sub_blocks=num_sub_blocks,
            channel_inc=channel_inc,
            epilog_kernel_size=epilog_kernel_size,
            prelog_kernel_size=prelog_kernel_size,
            prelog_stride=prelog_stride,
            prelog_n_channels=prelog_n_channels,
            blocks_kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        self.pred_net = ConvPredModule(
            in_features=prelog_n_channels + channel_inc * (2 + num_blocks),
            n_classes=n_classes,
            activation=nn.LogSoftmax(dim=-1),
        )


class Wav2Letter(CTCModel):
    """Implements Wav2Letter model proposed in
    https://arxiv.org/abs/1609.03193

    Args:

        in_features (int): The input/speech feature size.

        n_classes (int): The number of classes.

        n_conv_layers (int): The number of convolution layers.

        layers_kernel_size (int): The kernel size of the convolution layers.

        layers_channels_size (int): The number of output channels of each convolution layer.

        pre_conv_stride (int): The stride of the prenet convolution layer.

        pre_conv_kernel_size (int): The kernel size of the prenet convolution layer.

        post_conv_channels_size (int): The number of output channels of the
        postnet convolution layer.

        post_conv_kernel_size (int): The kernel size of the postnet convolution layer.

        p_dropout (float): The dropout rate.

        wav_kernel_size (Optional[int]): The kernel size of the first layer that
        processes the wav samples directly if wav is modeled. Default None.

        wav_stride (Optional[int]): The stride size of the first layer that
        processes the wav samples directly if wav is modeled. Default None.
    """

    def __init__(
        self,
        in_features: int,
        n_classes: int,
        n_conv_layers: int,
        layers_kernel_size: int,
        layers_channels_size: int,
        pre_conv_stride: int,
        pre_conv_kernel_size: int,
        post_conv_channels_size: int,
        post_conv_kernel_size: int,
        p_dropout: float,
        wav_kernel_size: Optional[int] = None,
        wav_stride: Optional[int] = None,
    ) -> None:
        # The sizes passed to the base class are placeholders: the prediction
        # head it builds is replaced by a convolutional one below.
        super().__init__(1, 1)
        self.encoder = Wav2LetterEncoder(
            in_features=in_features,
            n_conv_layers=n_conv_layers,
            layers_kernel_size=layers_kernel_size,
            layers_channels_size=layers_channels_size,
            pre_conv_stride=pre_conv_stride,
            pre_conv_kernel_size=pre_conv_kernel_size,
            post_conv_channels_size=post_conv_channels_size,
            post_conv_kernel_size=post_conv_kernel_size,
            p_dropout=p_dropout,
            wav_kernel_size=wav_kernel_size,
            wav_stride=wav_stride,
        )
        self.pred_net = ConvPredModule(
            in_features=post_conv_channels_size,
            n_classes=n_classes,
            activation=nn.LogSoftmax(dim=-1),
        )


class QuartzNet(CTCModel):
    """Implements QuartzNet model architecture proposed
    in https://arxiv.org/abs/1910.10261

    Args:

        n_classes (int): The number of classes.

        in_features (int): The input/speech feature size.

        num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper).

        block_repetition (int): The number of times to repeat each block (denoted as 'S' in the paper).

        num_sub_blocks (int): The number of QuartzNet subblocks (denoted as 'R' in the paper).

        channels_size (List[int]): A list of integers representing the number of output channels
        for each block.

        epilog_kernel_size (int): The kernel size of the convolution layer in the epilog block.

        epilog_channel_size (Tuple[int, int]): A tuple of the channel sizes of both
        epilog convolution layers; the second item is the input size of the
        prediction network.

        prelog_kernel_size (int): The kernel size of the convolution layer in the prelog block.

        prelog_stride (int): The stride size of the convolutional layer
        in the prelog block.

        prelog_n_channels (int): The number of output channels of the convolutional
        layer in the prelog block.

        groups (int): The groups size.

        blocks_kernel_size (Union[int, List[int]]): An integer or a list of integers representing the
        kernel size(s) for each block's convolutional layer.

        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_classes: int,
        in_features: int,
        num_blocks: int,
        block_repetition: int,
        num_sub_blocks: int,
        channels_size: List[int],
        epilog_kernel_size: int,
        epilog_channel_size: Tuple[int, int],
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        groups: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        # The sizes passed to the base class are placeholders: the prediction
        # head it builds is replaced by a convolutional one below.
        super().__init__(1, 1)
        self.encoder = QuartzNetEncoder(
            in_features=in_features,
            num_blocks=num_blocks,
            block_repetition=block_repetition,
            num_sub_blocks=num_sub_blocks,
            channels_size=channels_size,
            epilog_kernel_size=epilog_kernel_size,
            epilog_channel_size=epilog_channel_size,
            prelog_kernel_size=prelog_kernel_size,
            prelog_stride=prelog_stride,
            prelog_n_channels=prelog_n_channels,
            groups=groups,
            blocks_kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        self.pred_net = ConvPredModule(
            in_features=epilog_channel_size[1],
            n_classes=n_classes,
            activation=nn.LogSoftmax(dim=-1),
        )


class Squeezeformer(CTCModel):
    """Implements the Squeezeformer model architecture
    as described in https://arxiv.org/abs/2206.00888

    Args:

        n_classes (int): The number of classes.

        in_features (int): The input/speech feature size.

        n (int): The number of layers per block, (denoted as N in the paper).

        d_model (int): The model dimension.

        ff_expansion_factor (int): The expansion factor of linear layer in the
        feed forward module.

        h (int): The number of attention heads.

        kernel_size (int): The kernel size of the depth-wise convolution layer.

        pooling_kernel_size (int): The kernel size of the pooling convolution layer.

        pooling_stride (int): The stride size of the pooling convolution layer.

        ss_kernel_size (Union[int, List[int]]): The kernel size of the subsampling layer(s).

        ss_stride (Union[int, List[int]]): The stride of the subsampling layer(s).

        ss_n_conv_layers (int): The number of subsampling convolutional layers.

        p_dropout (float): The dropout rate.

        ss_groups (Union[int, List[int]]): The subsampling convolution groups size(s).
        Default 1.

        masking_value (float): The masking value. Default -1e15
    """

    def __init__(
        self,
        n_classes: int,
        in_features: int,
        n: int,
        d_model: int,
        ff_expansion_factor: int,
        h: int,
        kernel_size: int,
        pooling_kernel_size: int,
        pooling_stride: int,
        ss_kernel_size: Union[int, List[int]],
        ss_stride: Union[int, List[int]],
        ss_n_conv_layers: int,
        p_dropout: float,
        ss_groups: Union[int, List[int]] = 1,
        masking_value: float = -1e15,
    ) -> None:
        # The prediction head built by the base class consumes features of
        # size d_model.
        super().__init__(pred_in_size=d_model, n_classes=n_classes)
        self.encoder = SqueezeformerEncoder(
            in_features=in_features,
            n=n,
            d_model=d_model,
            ff_expansion_factor=ff_expansion_factor,
            h=h,
            kernel_size=kernel_size,
            pooling_kernel_size=pooling_kernel_size,
            pooling_stride=pooling_stride,
            ss_kernel_size=ss_kernel_size,
            ss_stride=ss_stride,
            ss_n_conv_layers=ss_n_conv_layers,
            p_dropout=p_dropout,
            ss_groups=ss_groups,
            masking_value=masking_value,
        )