"""This module contains various CTC (Connectionist Temporal Classification) models for speech recognition. The CTC models are implemented as subclasses of the base class CTCModel.
Classes:
- CTCModel(nn.Module): Base class for CTC models.
- DeepSpeechV1(CTCModel): DeepSpeech version 1 model.
- BERT(nn.Module): Bidirectional Encoder Representations from Transformers (BERT) model.
- DeepSpeechV2(CTCModel): DeepSpeech version 2 model.
- Conformer(CTCModel): Conformer model.
- Jasper(CTCModel): Jasper model.
- Wav2Letter(CTCModel): Wav2Letter model.
- QuartzNet(CTCModel): QuartzNet model.
- Squeezeformer(CTCModel): Squeezeformer model.
"""
from typing import List, Optional, Tuple, Union
import torch
from torch import Tensor, nn
from .encoders import (
ConformerEncoder,
DeepSpeechV1Encoder,
DeepSpeechV2Encoder,
JasperEncoder,
QuartzNetEncoder,
SqueezeformerEncoder,
Wav2LetterEncoder,
)
from .layers import ConvPredModule, PredModule, TransformerEncLayer
class CTCModel(nn.Module):
    """Abstract base of the CTC models.

    The base class only builds the prediction module (``pred_net``).
    Subclasses have to assign an ``encoder`` attribute, because ``forward``
    reads ``self.encoder`` and raises an error when it is missing.

    Args:
        pred_in_size (int): The input feature size of the prediction module.
        n_classes (int): The number of classes to predict.
    """

    def __new__(cls, *args, **kwargs):
        # Only subclasses may be instantiated; the base class is a template.
        if cls is not CTCModel:
            return object.__new__(cls)
        raise NotImplementedError(f"Cannot create object of type `{cls.__name__}`")

    def __init__(self, pred_in_size: int, n_classes: int) -> None:
        super().__init__()
        log_softmax = nn.LogSoftmax(dim=-1)
        self.pred_net = PredModule(
            in_features=pred_in_size,
            n_classes=n_classes,
            activation=log_softmax,
        )

    def forward(self, x: Tensor, mask: Tensor, *args, **kwargs):
        """Passes the speech input through the encoder and the prediction module.

        Args:
            x (Tensor): The input speech signal of shape [B, M, d]
            mask (Tensor): The speech mask of shape [B, M], where it's false
                for the positions that contain padding.

        Any extra positional/keyword arguments are forwarded to the encoder.

        Returns:
            Tuple[Tensor, Tensor]: A tuple where the first is the predictions of shape
            [M, B, C], and the second is the lengths tensor of shape [B].
        """
        encoded, lengths = self.encoder(x, mask, *args, **kwargs)  # B, M, d
        scores = self.pred_net(encoded)  # B, M, C
        # Swap the batch and time axes: [B, M, C] -> [M, B, C].
        return scores.permute(1, 0, 2), lengths
class DeepSpeechV1(CTCModel):
    """Builds the DeepSpeech model described in
    https://arxiv.org/abs/1412.5567

    Args:
        in_features (int): The input feature size.
        hidden_size (int): The hidden size of the rnn layers.
        n_linear_layers (int): The number of feed-forward layers.
        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.
        n_classes (int): The number of classes to predict.
        max_clip_value (int): The maximum relu clipping value.
        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.
        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        in_features: int,
        hidden_size: int,
        n_linear_layers: int,
        bidirectional: bool,
        n_classes: int,
        max_clip_value: int,
        rnn_type: str,
        p_dropout: float,
    ) -> None:
        super().__init__(pred_in_size=hidden_size, n_classes=n_classes)
        self.encoder = DeepSpeechV1Encoder(
            in_features=in_features,
            hidden_size=hidden_size,
            n_linear_layers=n_linear_layers,
            bidirectional=bidirectional,
            max_clip_value=max_clip_value,
            rnn_type=rnn_type,
            p_dropout=p_dropout,
        )

    @torch.no_grad()
    def predict(self, x: Tensor) -> Tensor:
        """Runs the model on un-padded input with gradient tracking disabled.

        Args:
            x (Tensor): The input speech signal of shape [B, T, F]
                (typically B = 1). Every frame is treated as valid.

        Returns:
            Tensor: The predictions produced by ``forward`` for ``x``.
        """
        # Build the all-ones mask on the same device as the input; a mask left
        # on the default device breaks inference for inputs living elsewhere
        # (e.g. on a GPU). The batch size follows the input instead of being
        # hard-coded to 1.
        mask = torch.ones(
            x.shape[0], x.shape[1], dtype=torch.long, device=x.device
        )
        preds, _ = self(x, mask)
        return preds
class BERT(nn.Module):
    """Implements the BERT Model as
    described in https://arxiv.org/abs/1810.04805

    Args:
        max_len (int): The maximum length for positional encoding.
        in_features (int): The input/speech feature size.
        d_model (int): The model dimensionality.
        h (int): The number of attention heads.
        ff_size (int): The inner size of the feed forward module.
        n_layers (int): The number of transformer encoders.
        n_classes (int): The number of classes.
        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        max_len: int,
        in_features: int,
        d_model: int,
        h: int,
        ff_size: int,
        n_layers: int,
        n_classes: int,
        p_dropout: float,
    ) -> None:
        super().__init__()
        # Projects the input features to the model dimensionality.
        self.fc = nn.Linear(
            in_features=in_features,
            out_features=d_model,
        )
        # Learned positional embeddings, one row per position.
        self.pos_emb = nn.Parameter(torch.randn(max_len, d_model))
        self.layers = nn.ModuleList(
            [
                TransformerEncLayer(d_model=d_model, ff_size=ff_size, h=h)
                for _ in range(n_layers)
            ]
        )
        self.pred_module = PredModule(
            in_features=d_model, n_classes=n_classes, activation=nn.LogSoftmax(dim=-1)
        )
        self.dropout = nn.Dropout(p_dropout)

    def embed(self, x: Tensor, mask: Tensor):
        """Adds the positional embeddings to the non-padded positions of ``x``.

        Args:
            x (Tensor): The projected input of shape [B, M, d].
            mask (Tensor): The mask of shape [B, M], non-zero/True for the
                positions that hold real data.

        Returns:
            Tensor: ``x`` plus the masked positional embeddings, shape [B, M, d].

        Raises:
            ValueError: If M exceeds the number of available positions.
        """
        # Slice by the actual (padded) time dimension rather than by the
        # longest valid length in the mask: the two differ whenever a batch is
        # padded beyond its longest sequence, which would otherwise produce a
        # shape mismatch. It also avoids a host sync through ``.item()``.
        seq_len = x.shape[1]
        n_positions = self.pos_emb.shape[0]
        if seq_len > n_positions:
            raise ValueError(
                f"The input length {seq_len} exceeds the maximum "
                f"positional length {n_positions}"
            )
        emb = self.pos_emb[:seq_len].unsqueeze(dim=0)  # 1, M, d
        # Broadcasting over the batch axis zeroes the embeddings at the
        # padded positions without materialising B copies of the table.
        emb = mask.unsqueeze(dim=-1) * emb  # B, M, d
        return emb + x

    def forward(self, x: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Passes the speech input to the model.

        Args:
            x (Tensor): The input speech signal of shape [B, T, F].
            mask (Tensor): The mask of shape [B, M], True where there is
                no padding.

        Returns:
            Tuple[Tensor, Tensor]: The predictions with the batch and time
            axes swapped, and the per-example lengths computed by summing
            the mask over its last axis.
        """
        lengths = mask.sum(dim=-1)
        out = self.fc(x)
        out = self.embed(out, mask)
        for layer in self.layers:
            out = layer(out, mask)
        # Dropout is applied once, after the whole encoder stack.
        out = self.dropout(out)
        preds = self.pred_module(out)
        preds = preds.permute(1, 0, 2)
        return preds, lengths
class DeepSpeechV2(CTCModel):
    """Implements the deep speech model
    proposed in https://arxiv.org/abs/1512.02595

    All arguments except ``n_classes`` are forwarded to ``DeepSpeechV2Encoder``;
    ``hidden_size`` is also used as the input size of the prediction module.

    Args:
        n_conv (int): The number of convolution layers.
        kernel_size (int): The kernel size of the convolution layers.
        stride (int): The stride size of the convolution layer.
        in_features (int): The input/speech feature size.
        hidden_size (int): The hidden size of the RNN layers.
        bidirectional (bool): A flag indicating if the rnn is bidirectional or not.
        n_rnn (int): The number of RNN layers.
        n_linear_layers (int): The number of linear layers.
        n_classes (int): The number of classes.
        max_clip_value (int): The maximum relu clipping value.
        rnn_type (str): The RNN type it has to be one of rnn, gru or lstm.
        tau (int): The future context size.
        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_conv: int,
        kernel_size: int,
        stride: int,
        in_features: int,
        hidden_size: int,
        bidirectional: bool,
        n_rnn: int,
        n_linear_layers: int,
        n_classes: int,
        max_clip_value: int,
        rnn_type: str,
        tau: int,
        p_dropout: float,
    ) -> None:
        super().__init__(pred_in_size=hidden_size, n_classes=n_classes)
        # Collect the encoder configuration first, then build it in one call.
        encoder_config = dict(
            n_conv=n_conv,
            kernel_size=kernel_size,
            stride=stride,
            in_features=in_features,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            n_rnn=n_rnn,
            n_linear_layers=n_linear_layers,
            max_clip_value=max_clip_value,
            rnn_type=rnn_type,
            tau=tau,
            p_dropout=p_dropout,
        )
        self.encoder = DeepSpeechV2Encoder(**encoder_config)
class Jasper(CTCModel):
    """Implements Jasper model architecture proposed
    in https://arxiv.org/abs/1904.03288

    Args:
        n_classes (int): The number of classes.
        in_features (int): The input/speech feature size.
        num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper).
        num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper).
        channel_inc (int): The rate to increase the number of channels across the blocks.
        epilog_kernel_size (int): The kernel size of the epilog block convolution layer.
        prelog_kernel_size (int): The kernel size of the prelog block convolution layer.
        prelog_stride (int): The stride size of the prelog block convolution layer.
        prelog_n_channels (int): The output channels of the prelog block convolution layer.
        blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the convolution layer for each block.
        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_classes: int,
        in_features: int,
        num_blocks: int,
        num_sub_blocks: int,
        channel_inc: int,
        epilog_kernel_size: int,
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        # The base class builds a placeholder prediction module of size (1, 1);
        # it is replaced by the convolutional one at the end of this method.
        super().__init__(pred_in_size=1, n_classes=1)
        # TODO: Add activation function options
        # TODO: Add normalization options
        # TODO: Add residual connections options
        # TODO: passing dropout list
        self.encoder = JasperEncoder(
            in_features=in_features,
            num_blocks=num_blocks,
            num_sub_blocks=num_sub_blocks,
            channel_inc=channel_inc,
            epilog_kernel_size=epilog_kernel_size,
            prelog_kernel_size=prelog_kernel_size,
            prelog_stride=prelog_stride,
            prelog_n_channels=prelog_n_channels,
            blocks_kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        # Input channel count of the prediction module, derived from the
        # prelog channels and the per-block channel increment.
        pred_in_channels = prelog_n_channels + channel_inc * (2 + num_blocks)
        log_softmax = nn.LogSoftmax(dim=-1)
        self.pred_net = ConvPredModule(
            in_features=pred_in_channels,
            n_classes=n_classes,
            activation=log_softmax,
        )
class Wav2Letter(CTCModel):
    """Implements Wav2Letter model proposed in
    https://arxiv.org/abs/1609.03193

    Args:
        in_features (int): The input/speech feature size.
        n_classes (int): The number of classes.
        n_conv_layers (int): The number of convolution layers.
        layers_kernel_size (int): The kernel size of the convolution layers.
        layers_channels_size (int): The number of output channels of each convolution layer.
        pre_conv_stride (int): The stride of the prenet convolution layer.
        pre_conv_kernel_size (int): The kernel size of the prenet convolution layer.
        post_conv_channels_size (int): The number of output channels of the
            postnet convolution layer.
        post_conv_kernel_size (int): The kernel size of the postnet convolution layer.
        p_dropout (float): The dropout rate.
        wav_kernel_size (Optional[int]): The kernel size of the first layer that
            processes the wav samples directly if wav is modeled. Default None.
        wav_stride (Optional[int]): The stride size of the first layer that
            processes the wav samples directly if wav is modeled. Default None.
    """

    def __init__(
        self,
        in_features: int,
        n_classes: int,
        n_conv_layers: int,
        layers_kernel_size: int,
        layers_channels_size: int,
        pre_conv_stride: int,
        pre_conv_kernel_size: int,
        post_conv_channels_size: int,
        post_conv_kernel_size: int,
        p_dropout: float,
        wav_kernel_size: Optional[int] = None,
        wav_stride: Optional[int] = None,
    ) -> None:
        # The base class builds a placeholder prediction module of size (1, 1);
        # it is replaced by the convolutional one at the end of this method.
        super().__init__(pred_in_size=1, n_classes=1)
        encoder_config = dict(
            in_features=in_features,
            n_conv_layers=n_conv_layers,
            layers_kernel_size=layers_kernel_size,
            layers_channels_size=layers_channels_size,
            pre_conv_stride=pre_conv_stride,
            pre_conv_kernel_size=pre_conv_kernel_size,
            post_conv_channels_size=post_conv_channels_size,
            post_conv_kernel_size=post_conv_kernel_size,
            p_dropout=p_dropout,
            wav_kernel_size=wav_kernel_size,
            wav_stride=wav_stride,
        )
        self.encoder = Wav2LetterEncoder(**encoder_config)
        # The prediction module consumes the postnet convolution channels.
        log_softmax = nn.LogSoftmax(dim=-1)
        self.pred_net = ConvPredModule(
            in_features=post_conv_channels_size,
            n_classes=n_classes,
            activation=log_softmax,
        )
class QuartzNet(CTCModel):
    """Implements QuartzNet model architecture proposed
    in https://arxiv.org/abs/1910.10261

    Args:
        n_classes (int): The number of classes.
        in_features (int): The input/speech feature size.
        num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper).
        block_repetition (int): The number of times to repeat each block (denoted as 'S' in the paper).
        num_sub_blocks (int): The number of QuartzNet subblocks (denoted as 'R' in the paper).
        channels_size (List[int]): A list of integers representing the number of output channels
            for each block.
        epilog_kernel_size (int): The kernel size of the convolution layer in the epilog block.
        epilog_channel_size (Tuple[int, int]): A tuple holding the channel sizes of both
            epilog convolution layers; the second entry is the input size of the
            prediction module.
        prelog_kernel_size (int): The kernel size of the convolution layer in the prelog block.
        prelog_stride (int): The stride size of the convolutional layer
            in the prelog block.
        prelog_n_channels (int): The number of output channels of the convolutional
            layer in the prelog block.
        groups (int): The groups size.
        blocks_kernel_size (Union[int, List[int]]): An integer or a list of integers representing the
            kernel size(s) for each block's convolutional layer.
        p_dropout (float): The dropout rate.
    """

    def __init__(
        self,
        n_classes: int,
        in_features: int,
        num_blocks: int,
        block_repetition: int,
        num_sub_blocks: int,
        channels_size: List[int],
        epilog_kernel_size: int,
        epilog_channel_size: Tuple[int, int],
        prelog_kernel_size: int,
        prelog_stride: int,
        prelog_n_channels: int,
        groups: int,
        blocks_kernel_size: Union[int, List[int]],
        p_dropout: float,
    ) -> None:
        # The base class builds a placeholder prediction module of size (1, 1);
        # it is replaced by the convolutional one at the end of this method.
        super().__init__(pred_in_size=1, n_classes=1)
        self.encoder = QuartzNetEncoder(
            in_features=in_features,
            num_blocks=num_blocks,
            block_repetition=block_repetition,
            num_sub_blocks=num_sub_blocks,
            channels_size=channels_size,
            epilog_kernel_size=epilog_kernel_size,
            epilog_channel_size=epilog_channel_size,
            prelog_kernel_size=prelog_kernel_size,
            prelog_stride=prelog_stride,
            prelog_n_channels=prelog_n_channels,
            groups=groups,
            blocks_kernel_size=blocks_kernel_size,
            p_dropout=p_dropout,
        )
        # The prediction module reads the channels of the last epilog layer.
        pred_in_channels = epilog_channel_size[1]
        log_softmax = nn.LogSoftmax(dim=-1)
        self.pred_net = ConvPredModule(
            in_features=pred_in_channels,
            n_classes=n_classes,
            activation=log_softmax,
        )