"""This file contains templates for various pre-implemented models. Each template is a model configuration for a specific pre-implemented model in the framework.
Classes:
- BaseTemplate: Base template that defines common configuration parameters for all models.
- DeepSpeechV1Temp: Template for configuring DeepSpeechV1 model.
- BERTTemp: Template for configuring BERT model.
- DeepSpeechV2Temp: Template for configuring DeepSpeechV2 model.
- ConformerCTCTemp: Template for configuring Conformer CTC model.
- JasperTemp: Template for configuring Jasper model.
- Wav2LetterTemp: Template for configuring Wav2Letter model.
- LASTemp: Template for configuring LAS model.
- BasicAttSeq2SeqRNNTemp: Template for configuring Basic Attention Seq2Seq RNN model.
- RNNWithLocationAwareAttTemp: Template for configuring RNN with Location-Aware Attention model.
- SpeechTransformerTemp: Template for configuring Speech Transformer model.
- QuartzNetTemp: Template for configuring QuartzNet model.
- SqueezeformerCTCTemp: Template for configuring Squeezeformer CTC model.
- RNNTTemp: Template for configuring RNNT model.
- ConformerTransducerTemp: Template for configuring Conformer Transducer model.
- ContextNetTemp: Template for configuring ContextNet model.
- VGGTransformerTransducerTemp: Template for configuring the VGG transformer with truncated self attention model.
- TransformerTransducerTemp: Template for configuring the transformer-transducer with truncated relative self attention model.
Builder:
The templates below can be used to build custom models:
- CTCModelBuilderTemp: Template for building CTC models.
- TransducerBuilderTemp: Template for building Transducer models.
- Seq2SeqBuilderTemp: Template for building Seq2Seq models.
"""
from dataclasses import asdict, dataclass
from typing import List, Optional, Tuple, Union
from torch.nn import Module
from speeq.constants import CTC_TYPE, MODEL_BUILDER_TYPE, SEQ2SEQ_TYPE, TRANSDUCER_TYPE
from speeq.interfaces import ITemplate
class BaseTemplate(ITemplate):
    """Base class shared by every model template in this module.

    Concrete templates are dataclasses that define the class attributes
    ``_name`` and ``_type``; this base class exposes them through the
    read-only ``name`` and ``type`` properties and provides a dictionary
    view of the template's fields.
    """

    def get_dict(self) -> dict:
        """Return the template's dataclass fields as a dictionary.

        The conversion is delegated to ``dataclasses.asdict``, so the
        instance must be a dataclass; field values that are not dataclasses,
        lists, tuples or dicts are deep-copied by ``asdict``.

        Returns:
            dict: A mapping from field name to field value.
        """
        return asdict(self)

    @property
    def name(self):
        """The value of the template's ``_name`` class attribute."""
        return self._name

    @property
    def type(self):
        """The value of the template's ``_type`` class attribute."""
        return self._type
@dataclass
class DeepSpeechV1Temp(BaseTemplate):
    """DeepSpeech 1 model template.

    https://arxiv.org/abs/1412.5567

    Args:
        in_features (int): The input feature size.
        hidden_size (int): The hidden size of the RNN layers.
        n_linear_layers (int): The number of feed-forward layers.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        max_clip_value (int): The maximum ReLU clipping value.
        p_dropout (float): The dropout rate.
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
            Default 'rnn'.
    """

    in_features: int
    hidden_size: int
    n_linear_layers: int
    bidirectional: bool
    max_clip_value: int
    p_dropout: float
    rnn_type: str = "rnn"
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "deep_speech_v1"
    _type = CTC_TYPE
@dataclass
class BERTTemp(BaseTemplate):
    """BERT model template.

    https://arxiv.org/abs/1810.04805

    Args:
        max_len (int): The maximum length for positional encoding.
        in_features (int): The input/speech feature size.
        d_model (int): The model dimensionality.
        h (int): The number of attention heads.
        ff_size (int): The inner size of the feed forward module.
        n_layers (int): The number of transformer encoders.
        p_dropout (float): The dropout rate.
    """

    max_len: int
    in_features: int
    d_model: int
    h: int
    ff_size: int
    n_layers: int
    p_dropout: float
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "bert"
    _type = CTC_TYPE
@dataclass
class DeepSpeechV2Temp(BaseTemplate):
    """Deep Speech 2 model template.

    https://arxiv.org/abs/1512.02595

    Args:
        n_conv (int): The number of convolution layers.
        kernel_size (int): The kernel size of the convolution layers.
        stride (int): The stride size of the convolution layer.
        in_features (int): The input/speech feature size.
        hidden_size (int): The hidden size of the RNN layers.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        n_rnn (int): The number of RNN layers.
        n_linear_layers (int): The number of linear layers.
        max_clip_value (int): The maximum ReLU clipping value.
        tau (int): The future context size.
        p_dropout (float): The dropout rate.
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
            Default 'rnn'.
    """

    n_conv: int
    kernel_size: int
    stride: int
    in_features: int
    hidden_size: int
    bidirectional: bool
    n_rnn: int
    n_linear_layers: int
    max_clip_value: int
    tau: int
    p_dropout: float
    rnn_type: str = "rnn"
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "deep_speech_v2"
    _type = CTC_TYPE
@dataclass
class JasperTemp(BaseTemplate):
    """Jasper model template.

    https://arxiv.org/abs/1904.03288

    Args:
        in_features (int): The input/speech feature size.
        num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper).
        num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper).
        channel_inc (int): The rate to increase the number of channels across the blocks.
        epilog_kernel_size (int): The kernel size of the epilog block convolution layer.
        prelog_kernel_size (int): The kernel size of the prelog block convolution layer.
        prelog_stride (int): The stride size of the prelog block convolution layer.
        prelog_n_channels (int): The output channels of the prelog block convolution layer.
        blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the
            convolution layer for each block.
        p_dropout (float): The dropout rate.
    """

    in_features: int
    num_blocks: int
    num_sub_blocks: int
    channel_inc: int
    epilog_kernel_size: int
    prelog_kernel_size: int
    prelog_stride: int
    prelog_n_channels: int
    blocks_kernel_size: Union[int, List[int]]
    p_dropout: float
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "jasper"
    _type = CTC_TYPE
@dataclass
class Wav2LetterTemp(BaseTemplate):
    """Wav2Letter model template.

    https://arxiv.org/abs/1609.03193

    Args:
        in_features (int): The input/speech feature size.
        n_conv_layers (int): The number of convolution layers.
        layers_kernel_size (int): The kernel size of the convolution layers.
        layers_channels_size (int): The number of output channels of each
            convolution layer.
        pre_conv_stride (int): The stride of the prenet convolution layer.
        pre_conv_kernel_size (int): The kernel size of the prenet convolution layer.
        post_conv_channels_size (int): The number of output channels of the
            postnet convolution layer.
        post_conv_kernel_size (int): The kernel size of the postnet convolution layer.
        p_dropout (float): The dropout rate.
        wav_kernel_size (Optional[int]): The kernel size of the first layer that
            processes the wav samples directly if wav is modeled. Default None.
        wav_stride (Optional[int]): The stride size of the first layer that
            processes the wav samples directly if wav is modeled. Default None.
    """

    in_features: int
    n_conv_layers: int
    layers_kernel_size: int
    layers_channels_size: int
    pre_conv_stride: int
    pre_conv_kernel_size: int
    post_conv_channels_size: int
    post_conv_kernel_size: int
    p_dropout: float
    wav_kernel_size: Optional[int] = None
    wav_stride: Optional[int] = None
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "wav2letter"
    _type = CTC_TYPE
@dataclass
class LASTemp(BaseTemplate):
    """Listen, Attend and Spell model template.

    https://arxiv.org/abs/1508.01211

    Args:
        in_features (int): The encoder's input feature speech size.
        hidden_size (int): The hidden size of the RNN layers.
        enc_num_layers (int): The number of layers in the encoder.
        reduction_factor (int): The time resolution reduction factor.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        dec_num_layers (int): The number of the RNN layers in the decoder.
        emb_dim (int): The embedding size.
        p_dropout (float): The dropout rate.
        pred_activation (Module): An instance of an activation function to be
            applied on the last dimension of the predicted logits.
        teacher_forcing_rate (float): The teacher forcing rate. Default 0.0
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
            Default 'rnn'.
    """

    in_features: int
    hidden_size: int
    enc_num_layers: int
    reduction_factor: int
    bidirectional: bool
    dec_num_layers: int
    emb_dim: int
    p_dropout: float
    pred_activation: Module
    teacher_forcing_rate: float = 0.0
    rnn_type: str = "rnn"
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "las"
    _type = SEQ2SEQ_TYPE
@dataclass
class BasicAttSeq2SeqRNNTemp(BaseTemplate):
    """Basic RNN encoder-decoder model template.

    Args:
        in_features (int): The encoder's input feature speech size.
        hidden_size (int): The hidden size of the RNN layers.
        enc_num_layers (int): The number of layers in the encoder.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        dec_num_layers (int): The number of the RNN layers in the decoder.
        emb_dim (int): The embedding size.
        p_dropout (float): The dropout rate.
        pred_activation (Module): An instance of an activation function.
        teacher_forcing_rate (float): The teacher forcing rate. Default 0.0
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
            Default 'rnn'.
    """

    in_features: int
    hidden_size: int
    enc_num_layers: int
    bidirectional: bool
    dec_num_layers: int
    emb_dim: int
    p_dropout: float
    pred_activation: Module
    teacher_forcing_rate: float = 0.0
    rnn_type: str = "rnn"
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "basic_att_rnn"
    _type = SEQ2SEQ_TYPE
@dataclass
class RNNWithLocationAwareAttTemp(BaseTemplate):
    """RNN seq2seq with location-aware attention model template.

    https://arxiv.org/abs/1506.07503

    Args:
        in_features (int): The encoder's input feature speech size.
        hidden_size (int): The hidden size of the RNN layers.
        enc_num_layers (int): The number of layers in the encoder.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        dec_num_layers (int): The number of the RNN layers in the decoder.
        emb_dim (int): The embedding size.
        kernel_size (int): The attention kernel size.
        activation (str): The activation function to use in the attention layer;
            it can be either softmax or sigmax.
        p_dropout (float): The dropout rate.
        pred_activation (Module): An instance of an activation function to be
            applied on the last dimension of the predicted logits.
        inv_temperature (Union[float, int]): The inverse temperature value. Default 1.
        teacher_forcing_rate (float): The teacher forcing rate. Default 0.0
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
            Default 'rnn'.
    """

    in_features: int
    hidden_size: int
    enc_num_layers: int
    bidirectional: bool
    dec_num_layers: int
    emb_dim: int
    kernel_size: int
    activation: str
    p_dropout: float
    pred_activation: Module
    inv_temperature: Union[float, int] = 1
    teacher_forcing_rate: float = 0.0
    rnn_type: str = "rnn"
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "rnn_with_location_att"
    _type = SEQ2SEQ_TYPE
@dataclass
class QuartzNetTemp(BaseTemplate):
    """QuartzNet model template.

    https://arxiv.org/abs/1910.10261

    Args:
        in_features (int): The input/speech feature size.
        num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper).
        block_repetition (int): The number of times to repeat each block
            (denoted as 'S' in the paper).
        num_sub_blocks (int): The number of QuartzNet subblocks (denoted as 'R'
            in the paper).
        channels_size (List[int]): A list of integers representing the number of
            output channels for each block.
        epilog_kernel_size (int): The kernel size of the convolution layer in
            the epilog block.
        epilog_channel_size (Tuple[int, int]): A tuple holding the channel size
            for each of the two epilog convolution layers.
        prelog_kernel_size (int): The kernel size of the convolution layer in
            the prelog block.
        prelog_stride (int): The stride size of the convolutional layer
            in the prelog block.
        prelog_n_channels (int): The number of output channels of the convolutional
            layer in the prelog block.
        groups (int): The groups size.
        blocks_kernel_size (Union[int, List[int]]): An integer or a list of
            integers representing the kernel size(s) for each block's
            convolutional layer.
        p_dropout (float): The dropout rate.
    """

    in_features: int
    num_blocks: int
    block_repetition: int
    num_sub_blocks: int
    channels_size: List[int]
    epilog_kernel_size: int
    epilog_channel_size: Tuple[int, int]
    prelog_kernel_size: int
    prelog_stride: int
    prelog_n_channels: int
    groups: int
    blocks_kernel_size: Union[int, List[int]]
    p_dropout: float
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "quartz_net"
    _type = CTC_TYPE
@dataclass
class RNNTTemp(BaseTemplate):
    """RNN transducer model template.

    https://arxiv.org/abs/1211.3711

    Args:
        in_features (int): The input feature size.
        emb_dim (int): The embedding layer's size.
        n_layers (int): The number of the RNN layers in the encoder.
        n_dec_layers (int): The number of RNNs in the decoder (predictor).
        hidden_size (int): The hidden size of the RNN layers.
        bidirectional (bool): A flag indicating if the RNN is bidirectional or not.
        rnn_type (str): The RNN type.
        p_dropout (float): The dropout rate.
    """

    in_features: int
    emb_dim: int
    n_layers: int
    n_dec_layers: int
    hidden_size: int
    bidirectional: bool
    rnn_type: str
    p_dropout: float
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "rnn-t"
    _type = TRANSDUCER_TYPE
@dataclass
class ContextNetTemp(BaseTemplate):
    """ContextNet transducer model template.

    https://arxiv.org/abs/2005.03191

    Args:
        in_features (int): The input feature size.
        emb_dim (int): The embedding layer's size.
        n_layers (int): The number of ContextNet blocks.
        n_dec_layers (int): The number of RNNs in the decoder (predictor).
        n_sub_layers (Union[int, List[int]]): The number of convolutional
            layers per block. If a list is passed, it has to be of length equal
            to `n_layers`.
        stride (Union[int, List[int]]): The stride of the last convolutional
            layers per block. If a list is passed, it has to be of length equal
            to `n_layers`.
        out_channels (Union[int, List[int]]): The channels size of the
            convolutional layers per block. If a list is passed, it has to be of
            length equal to `n_layers`.
        kernel_size (int): The convolutional layers kernel size.
        reduction_factor (int): The feature reduction size of the
            Squeeze-and-excitation module.
        rnn_type (str): The RNN type; it has to be one of rnn, gru or lstm.
    """

    in_features: int
    emb_dim: int
    n_layers: int
    n_dec_layers: int
    n_sub_layers: Union[int, List[int]]
    stride: Union[int, List[int]]
    out_channels: Union[int, List[int]]
    kernel_size: int
    reduction_factor: int
    rnn_type: str
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = "context_net"
    _type = TRANSDUCER_TYPE
@dataclass
class CTCModelBuilderTemp(BaseTemplate):
    """CTC-based model builder template.

    Args:
        encoder (Module): The speech encoder (acoustic model), such that
            the forward of the encoder returns a tuple of the encoded speech
            tensor and a length tensor of the encoded speech.
        pred_net (Union[Module, None]): The prediction network. If provided,
            the forward of the prediction network is expected to have log softmax
            as an activation function, and the predictions of shape [T, B, C]
            where T is the sequence length, B the batch size, and C the number
            of classes. Default None.
        feat_size (Union[int, None]): Used if pred_net parameter is not None
            where it's the encoder's output feature size. Default None.
    """

    encoder: Module
    pred_net: Union[Module, None] = None
    feat_size: Union[int, None] = None
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = CTC_TYPE
    _type = MODEL_BUILDER_TYPE
@dataclass
class TransducerBuilderTemp(BaseTemplate):
    """Transducer-based model builder template.

    Args:
        encoder (Module): The speech encoder (acoustic model), such that
            the forward method of the encoder returns a tuple of the encoded
            speech tensor and a length tensor for the encoded speech.
        decoder (Module): The text decoder such that
            the forward method of the decoder returns a tuple of the encoded
            text tensor and a length tensor for the encoded text.
        join_net (Union[Module, None]): The join network. If provided,
            the forward of the join network is expected to have no activation
            function, and the results of shape [B, Ts, Tt, C], where B is the
            batch size, Ts is the speech sequence length, Tt is the text
            sequence length, and C the number of classes. Default None.
        feat_size (Union[None, int]): Used if join_net parameter is not None
            where it's the encoder and the decoder's output feature size.
            Default None.
    """

    encoder: Module
    decoder: Module
    join_net: Union[Module, None] = None
    feat_size: Union[None, int] = None
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = TRANSDUCER_TYPE
    _type = MODEL_BUILDER_TYPE
@dataclass
class Seq2SeqBuilderTemp(BaseTemplate):
    """Seq2Seq-based model builder template.

    Args:
        encoder (Module): The speech encoder (acoustic model), such that
            the forward method of the encoder returns a tuple of the encoded
            speech tensor, the last encoder hidden state tensor/tuple if there
            is any, and a length tensor for the encoded speech.
        decoder (Module): The text decoder such that
            the forward method of the decoder takes the encoder's output, the
            last encoder's hidden state (if there is any), the encoder mask,
            the decoder input, and the decoder mask and returns the prediction
            tensor.
    """

    encoder: Module
    decoder: Module
    # Un-annotated class attributes: they are not dataclass fields, so they
    # are excluded from __init__ and from the asdict() output.
    _name = SEQ2SEQ_TYPE
    _type = MODEL_BUILDER_TYPE