Source code for speeq.models.templates

"""This file contains templates for various pre-implemented models. Each template is a model configuration for a specific pre-implemented model in the framework.

Classes:

- BaseTemplate: Base template that defines common configuration parameters for all models.
- DeepSpeechV1Temp: Template for configuring DeepSpeechV1 model.
- BERTTemp: Template for configuring BERT model.
- DeepSpeechV2Temp: Template for configuring DeepSpeechV2 model.
- ConformerCTCTemp: Template for configuring Conformer CTC model.
- JasperTemp: Template for configuring Jasper model.
- Wav2LetterTemp: Template for configuring Wav2Letter model.
- LASTemp: Template for configuring LAS model.
- BasicAttSeq2SeqRNNTemp: Template for configuring Basic Attention Seq2Seq RNN model.
- RNNWithLocationAwareAttTemp: Template for configuring RNN with Location-Aware Attention model.
- SpeechTransformerTemp: Template for configuring Speech Transformer model.
- QuartzNetTemp: Template for configuring QuartzNet model.
- SqueezeformerCTCTemp: Template for configuring Squeezeformer CTC model.
- RNNTTemp: Template for configuring RNNT model.
- ConformerTransducerTemp: Template for configuring Conformer Transducer model.
- ContextNetTemp: Template for configuring ContextNet model.
- VGGTransformerTransducerTemp: Template for configuring the VGG transformer with truncated self attention model.
- TransformerTransducerTemp: Template for configuring the transformer-transducer with truncated relative self attention model.

Builder:

The below templates can be used to build custome model:

- CTCModelBuilderTemp: Template for building CTC models.
- TransducerBuilderTemp: Template for building Transducer models.
- Seq2SeqBuilderTemp: Template for building Seq2Seq models.



"""
from dataclasses import asdict, dataclass
from typing import List, Optional, Tuple, Union

from torch.nn import Module

from speeq.constants import CTC_TYPE, MODEL_BUILDER_TYPE, SEQ2SEQ_TYPE, TRANSDUCER_TYPE
from speeq.interfaces import ITemplate


[docs]class BaseTemplate(ITemplate):
[docs] def get_dict(self): return asdict(self)
@property def name(self): return self._name @property def type(self): return self._type
[docs]@dataclass class DeepSpeechV1Temp(BaseTemplate): """DeepSpeech 1 model template https://arxiv.org/abs/1412.5567 Args: in_features (int): The input feature size. hidden_size (int): The hidden size of the rnn layers. n_linear_layers (int): The number of feed-forward layers. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. max_clip_value (int): The maximum relu clipping value. p_dropout (float): The dropout rate. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. """ in_features: int hidden_size: int n_linear_layers: int bidirectional: bool max_clip_value: int p_dropout: float rnn_type: str = "rnn" _name = "deep_speech_v1" _type = CTC_TYPE
[docs]@dataclass class BERTTemp(BaseTemplate): """BERT model template https://arxiv.org/abs/1810.04805 Args: max_len (int): The maximum length for positional encoding. in_features (int): The input/speech feature size. d_model (int): The model dimensionality. h (int): The number of attention heads. ff_size (int): The inner size of the feed forward module. n_layers (int): The number of transformer encoders. p_dropout (float): The dropout rate. """ max_len: int in_features: int d_model: int h: int ff_size: int n_layers: int p_dropout: float _name = "bert" _type = CTC_TYPE
[docs]@dataclass class DeepSpeechV2Temp(BaseTemplate): """deep speech 2 model template https://arxiv.org/abs/1512.02595 Args: n_conv (int): The number of convolution layers. kernel_size (int): The kernel size of the convolution layers. stride (int): The stride size of the convolution layer. in_features (int): The input/speech feature size. hidden_size (int): The hidden size of the RNN layers. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. n_rnn (int): The number of RNN layers. n_linear_layers (int): The number of linear layers. max_clip_value (int): The maximum relu clipping value. tau (int): The future context size. p_dropout (float): The dropout rate. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. """ n_conv: int kernel_size: int stride: int in_features: int hidden_size: int bidirectional: bool n_rnn: int n_linear_layers: int max_clip_value: int tau: int p_dropout: float rnn_type: str = "rnn" _name = "deep_speech_v2" _type = CTC_TYPE
[docs]@dataclass class ConformerCTCTemp(BaseTemplate): """ConformerCTC model template https://arxiv.org/abs/2005.08100 Args: d_model (int): The model dimension. n_conf_layers (int): The number of conformer blocks. ff_expansion_factor (int): The feed-forward expansion factor. h (int): The number of attention heads. kernel_size (int): The convolution module kernel size. ss_kernel_size (int): The subsampling layer kernel size. ss_stride (int): The subsampling layer stride size. ss_num_conv_layers (int): The number of subsampling convolutional layers. in_features (int): The input/speech feature size. res_scaling (float): The residual connection multiplier. p_dropout (float): The dropout rate. """ d_model: int n_conf_layers: int ff_expansion_factor: int h: int kernel_size: int ss_kernel_size: int ss_stride: int ss_num_conv_layers: int in_features: int res_scaling: float p_dropout: float _name = "conformer" _type = CTC_TYPE
[docs]@dataclass class JasperTemp(BaseTemplate): """Jasper model template https://arxiv.org/abs/1904.03288 Args: in_features (int): The input/speech feature size. num_blocks (int): The number of Jasper blocks (denoted as 'B' in the paper). num_sub_blocks (int): The number of Jasper subblocks (denoted as 'R' in the paper). channel_inc (int): The rate to increase the number of channels across the blocks. epilog_kernel_size (int): The kernel size of the epilog block convolution layer. prelog_kernel_size (int): The kernel size of the prelog block ocnvolution layer. prelog_stride (int): The stride size of the prelog block convolution layer. prelog_n_channels (int): The output channnels of the prelog block convolution layer. blocks_kernel_size (Union[int, List[int]]): The kernel size(s) of the convolution layer for each block. p_dropout (float): The dropout rate. """ in_features: int num_blocks: int num_sub_blocks: int channel_inc: int epilog_kernel_size: int prelog_kernel_size: int prelog_stride: int prelog_n_channels: int blocks_kernel_size: Union[int, List[int]] p_dropout: float _name = "jasper" _type = CTC_TYPE
[docs]@dataclass class Wav2LetterTemp(BaseTemplate): """Wav2Letter model template https://arxiv.org/abs/1609.03193 Args: in_features (int): The input/speech feature size. n_conv_layers (int): The number of convolution layers. layers_kernel_size (int): The kernel size of the convolution layers. layers_channels_size (int): The number of output channels of each convolution layer. pre_conv_stride (int): The stride of the prenet convolution layer. pre_conv_kernel_size (int): The kernel size of the prenet convolution layer. post_conv_channels_size (int): The number of output channels of the postnet convolution layer. post_conv_kernel_size (int): The kernel size of the postnet convolution layer. p_dropout (float): The dropout rate. wav_kernel_size (Optional[int]): The kernel size of the first layer that processes the wav samples directly if wav is modeled. Default None. wav_stride (Optional[int]): The stride size of the first layer that processes the wav samples directly if wav is modeled. Default None. """ in_features: int n_conv_layers: int layers_kernel_size: int layers_channels_size: int pre_conv_stride: int pre_conv_kernel_size: int post_conv_channels_size: int post_conv_kernel_size: int p_dropout: float wav_kernel_size: Optional[int] = None wav_stride: Optional[int] = None _name = "wav2letter" _type = CTC_TYPE
[docs]@dataclass class LASTemp(BaseTemplate): """Listen, Attend and Spell model template https://arxiv.org/abs/1508.01211 Args: in_features (int): The encoder's input feature speech size. hidden_size (int): The hidden size of the RNN layers. enc_num_layers (int): The number of layers in the encoder. reduction_factor (int): The time resolution reduction factor. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. dec_num_layers (int): The number of the RNN layers in the decoder. emb_dim (int): The embedding size. p_dropout (float): The dropout rate. pred_activation (Module): An instance of an activation function to be applied on the last dimension of the predicted logits.. teacher_forcing_rate (float): The teacher forcing rate. Default 0.0 rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. Default 'rnn'. """ in_features: int hidden_size: int enc_num_layers: int reduction_factor: int bidirectional: bool dec_num_layers: int emb_dim: int p_dropout: float pred_activation: Module teacher_forcing_rate: float = 0.0 rnn_type: str = "rnn" _name = "las" _type = SEQ2SEQ_TYPE
[docs]@dataclass class BasicAttSeq2SeqRNNTemp(BaseTemplate): """Basic RNN encoder decoder model template. Args: in_features (int): The encoder's input feature speech size. hidden_size (int): The hidden size of the RNN layers. enc_num_layers (int): The number of layers in the encoder. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. dec_num_layers (int): The number of the RNN layers in the decoder. emb_dim (int): The embedding size. p_dropout (float): The dropout rate. pred_activation (Module): An instance of an activation function. teacher_forcing_rate (float): The teacher forcing rate. Default 0.0 rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. Default 'rnn'. """ in_features: int hidden_size: int enc_num_layers: int bidirectional: bool dec_num_layers: int emb_dim: int p_dropout: float pred_activation: Module teacher_forcing_rate: float = 0.0 rnn_type: str = "rnn" _name = "basic_att_rnn" _type = SEQ2SEQ_TYPE
[docs]@dataclass class RNNWithLocationAwareAttTemp(BaseTemplate): """RNN seq2seq with location aware attention model tempalte in https://arxiv.org/abs/1506.07503 Args: in_features (int): The encoder's input feature speech size. hidden_size (int): The hidden size of the RNN layers. enc_num_layers (int): The number of layers in the encoder. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. dec_num_layers (int): The number of the RNN layers in the decoder. emb_dim (int): The embedding size. kernel_size (int): The attention kernel size. activation (str): The activation function to use in the attention layer. it can be either softmax or sigmax. p_dropout (float): The dropout rate. pred_activation (Module): An instance of an activation function to be applied on the last dimension of the predicted logits.. inv_temperature (Union[float, int]): The inverse temperature value. Default 1. teacher_forcing_rate (float): The teacher forcing rate. Default 0.0 rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. Default 'rnn'. """ in_features: int hidden_size: int enc_num_layers: int bidirectional: bool dec_num_layers: int emb_dim: int kernel_size: int activation: str p_dropout: float pred_activation: Module inv_temperature: Union[float, int] = 1 teacher_forcing_rate: float = 0.0 rnn_type: str = "rnn" _name = "rnn_with_location_att" _type = SEQ2SEQ_TYPE
[docs]@dataclass class SpeechTransformerTemp(BaseTemplate): """Speech Transformer model template https://ieeexplore.ieee.org/document/8462506 Args: in_features (int): The input/speech feature size. n_conv_layers (int): The number of down-sampling convolutional layers. kernel_size (int): The kernel size of the down-sampling convolutional layers. stride (int): The stride size of the down-sampling convolutional layers. d_model (int): The model dimensionality. n_enc_layers (int): The number of encoder layers. n_dec_layers (int): The number of decoder layers. ff_size (int): The dimensionality of the inner layer of the feed-forward module. h (int): The number of attention heads. att_kernel_size (int): The kernel size of the attentional convolutional layers. att_out_channels (int): The number of output channels of the attentional convolution layers. pred_activation (Module): An activation function instance to be applied on the last dimension of the predicted logits. masking_value (int): The attentin masking value. Default -1e15 """ in_features: int n_conv_layers: int kernel_size: int stride: int d_model: int n_enc_layers: int n_dec_layers: int ff_size: int h: int att_kernel_size: int att_out_channels: int pred_activation: Module masking_value: int = -1e15 _name = "speech_transformer" _type = SEQ2SEQ_TYPE
[docs]@dataclass class QuartzNetTemp(BaseTemplate): """QuartzNet model template https://arxiv.org/abs/1910.10261 Args: in_features (int): The input/speech feature size. num_blocks (int): The number of QuartzNet blocks (denoted as 'B' in the paper). block_repetition (int): The number of times to repeat each block (denoted as 'S' in the paper). num_sub_blocks (int): The number of QuartzNet subblocks, (denoted as 'R' in the paper). channels_size (List[int]): A list of integers representing the number of output channels for each block. epilog_kernel_size (int): The kernel size of the convolution layer in the epilog block. epilog_channel_size (Tuple[int, int]): A tuple for both epilog layers of the convolution layer . prelog_kernel_size (int): The kernel size pf the convolution layer in the prelog block. prelog_stride (int): The stride size of the of the convoltuional layer in the prelog block. prelog_n_channels (int): The number of output channels of the convolutional layer in the prelog block. groups (int): The groups size. blocks_kernel_size (Union[int, List[int]]): An integer or a list of integers representing the kernel size(s) for each block's convolutional layer. p_dropout (float): The dropout rate. """ in_features: int num_blocks: int block_repetition: int num_sub_blocks: int channels_size: List[int] epilog_kernel_size: int epilog_channel_size: Tuple[int, int] prelog_kernel_size: int prelog_stride: int prelog_n_channels: int groups: int blocks_kernel_size: Union[int, List[int]] p_dropout: float _name = "quartz_net" _type = CTC_TYPE
[docs]@dataclass class SqueezeformerCTCTemp(BaseTemplate): """Squeezeformer model template https://arxiv.org/abs/2206.00888 Args: in_features (int): The input/speech feature size. n (int): The number of layers per block, (denoted as N in the paper). d_model (int): The model dimension. ff_expansion_factor (int): The expansion factor of linear layer in the feed forward module. h (int): The number of attention heads. kernel_size (int): The kernel size of the depth-wise convolution layer. pooling_kernel_size (int): The kernel size of the pooling convolution layer. pooling_stride (int): The stride size of the pooling convolution layer. ss_kernel_size (Union[int, List[int]]): The kernel size of the subsampling layer(s). ss_stride (Union[int, List[int]]): The stride of the subsampling layer(s). ss_n_conv_layers (int): The number of subsampling convolutional layers. p_dropout (float): The dropout rate. ss_groups (Union[int, List[int]]): The subsampling convolution groups size(s). masking_value (int): The masking value. Default -1e15 """ in_features: int n: int d_model: int ff_expansion_factor: int h: int kernel_size: int pooling_kernel_size: int pooling_stride: int ss_kernel_size: Union[int, List[int]] ss_stride: Union[int, List[int]] ss_n_conv_layers: int p_dropout: float ss_groups: Union[int, List[int]] = 1 masking_value: int = -1e15 _name = "squeezeformer" _type = CTC_TYPE
[docs]@dataclass class RNNTTemp(BaseTemplate): """RNN transducer model template https://arxiv.org/abs/1211.3711 Args: in_features (int): The input feature size. emb_dim (int): The embedding layer's size. n_layers (int): The number of the RNN layers in the encoder. n_dec_layers (int): The number of RNNs in the decoder (predictor). hidden_size (int): The hidden size of the RNN layers. bidirectional (bool): A flag indicating if the rnn is bidirectional or not. rnn_type (str): The RNN type. p_dropout (float): The dropout rate. """ in_features: int emb_dim: int n_layers: int n_dec_layers: int hidden_size: int bidirectional: bool rnn_type: str p_dropout: float _name = "rnn-t" _type = TRANSDUCER_TYPE
[docs]@dataclass class ConformerTransducerTemp(BaseTemplate): """Conformer transducer model template https://arxiv.org/abs/2005.08100 Args: d_model (int): The model dimension. n_conf_layers (int): The number of conformer blocks. n_dec_layers (int): The number of RNNs in the decoder (predictor). ff_expansion_factor (int): The feed-forward expansion factor. h (int): The number of attention heads. kernel_size (int): The convolution module kernel size. ss_kernel_size (int): The subsampling layer kernel size. ss_stride (int): The subsampling layer stride size. ss_num_conv_layers (int): The number of subsampling convolutional layers. in_features (int): The input/speech feature size. res_scaling (float): The residual connection multiplier. emb_dim (int): The embedding layer's size. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. p_dropout (float): The dropout rate. """ d_model: int n_conf_layers: int n_dec_layers: int ff_expansion_factor: int h: int kernel_size: int ss_kernel_size: int ss_stride: int ss_num_conv_layers: int in_features: int res_scaling: float emb_dim: int rnn_type: str p_dropout: float _name = "conformer" _type = TRANSDUCER_TYPE
[docs]@dataclass class ContextNetTemp(BaseTemplate): """ContextNet transducer model template https://arxiv.org/abs/2005.03191 Args: in_features (int): The input feature size. emb_dim (int): The embedding layer's size. n_layers (int): The number of ContextNet blocks. n_dec_layers (int): The number of RNNs in the decoder (predictor). n_sub_layers (Union[int, List[int]]): The number of convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. stride (Union[int, List[int]]): The stride of the last convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. out_channels (Union[int, List[int]]): The channels size of the convolutional layers per block. If list is passed, it has to be of length equal to `n_layers`. kernel_size (int): The convolutional layers kernel size. reduction_factor (int): The feature reduction size of the Squeeze-and-excitation module. rnn_type (str): The RNN type it has to be one of rnn, gru or lstm. """ in_features: int emb_dim: int n_layers: int n_dec_layers: int n_sub_layers: Union[int, List[int]] stride: Union[int, List[int]] out_channels: Union[int, List[int]] kernel_size: int reduction_factor: int rnn_type: str _name = "context_net" _type = TRANSDUCER_TYPE
[docs]@dataclass class CTCModelBuilderTemp(BaseTemplate): """CTC-based model builder template Args: encoder (Module): The speech encoder (acoustic model), such that the forward of the encoder returns a tuple of the encoded speech tensor and a length tensor of the encoded speech. pred_net (Union[Module, None]): The prediction network. if provided the forward of the prediction network expected to have log softmax as an activation function, and the predictions of shape [T, B, C] where T is the sequence length, B the batch size, and C the number of classes. Default None. feat_size (Union[Module, None]): Used if pred_net parameter is not None where it's the encoder's output feature size. Default None. """ encoder: Module pred_net: Union[Module, None] = None feat_size: Union[int, None] = None _name = CTC_TYPE _type = MODEL_BUILDER_TYPE
[docs]@dataclass class TransducerBuilderTemp(BaseTemplate): """Transducer-based model builder template Args: encoder (Module): The speech encoder (acoustic model), such that the forward method of the encoder returns a tuple of the encoded speech tensor and a length tensor for the encoded speech. decoder (Module): The text decoder such that the forward method of the decoder returns a tuple of the encoded text tensor and a length tensor for the encoded text. join_net (Union[Module, None]): The join network. if provided the forward of the join network expected to have no activation function, and the results of shape [B, Ts, Tt, C], where B the batch size, Ts is the speech sequence length, Tt is the text sequence length, and C the number of classes. Default None. feat_size (Union[Module, None]): Used if join_net parameter is not None where it's the encoder and the decoder's output feature size. Default None. """ encoder: Module decoder: Module join_net: Union[Module, None] = None feat_size: Union[None, int] = None _name = TRANSDUCER_TYPE _type = MODEL_BUILDER_TYPE
[docs]@dataclass class Seq2SeqBuilderTemp(BaseTemplate): """Seq2Seq-based model builder template Args: encoder (Module): The speech encoder (acoustic model), such that the forward method of the encoder returns a tuple of the encoded speech tensor, the last encoder hidden state tensor/tuple if there is any, and a length tensor for the encoded speech. decoder (Module): The text decoder such that the forward method of the decoder takes the encoder's output, the last encoder's hidden state (if there is any), the encoder mask, the decoder input, and the decoder mask and returns the prediction tensor. """ encoder: Module decoder: Module _name = SEQ2SEQ_TYPE _type = MODEL_BUILDER_TYPE
[docs]@dataclass class VGGTransformerTransducerTemp(BaseTemplate): """VGG Transformer transducer model template https://arxiv.org/abs/1910.12977 Args: in_features (int): The input feature size. emb_dim (int): The embedding layer's size. n_layers (int): The number of transformer encoder layers with truncated self attention. n_dec_layers (int): The number of RNNs in the decoder (predictor). rnn_type (str): The RNN type. n_vgg_blocks (int): The number of VGG blocks to use. n_conv_layers_per_vgg_block (List[int]): A list of integers that specifies the number of convolution layers in each block. kernel_sizes_per_vgg_block (List[List[int]]): A list of lists that contains the kernel size for each layer in each block. The length of the outer list should match `n_vgg_blocks`, and each inner list should be the same length as the corresponding block's number of layers. n_channels_per_vgg_block (List[List[int]]): A list of lists that contains the number of channels for each convolution layer in each block. This argument should also have length equal to `n_vgg_blocks`, and each sublist should have length equal to the number of layers in the corresponding block. vgg_pooling_kernel_size (List[int]): A list of integers that specifies the size of the max pooling layer in each block. The length of this list should be equal to `n_vgg_blocks`. d_model (int): The model dimensionality. ff_size (int): The feed forward inner layer dimensionality. h (int): The number of heads in the attention mechanism. joint_size (int): The joint layer feature size (denoted as do in the paper). left_size (int): The size of the left window that each time step is allowed to look at. right_size (int): The size of the right window that each time step is allowed to look at. p_dropout (float): The dropout rate. masking_value (float, optional): The value to use for masking padded elements. Defaults to -1e15. """ in_features: int emb_dim: int n_layers: int n_dec_layers: int rnn_type: str n_vgg_blocks: int n_conv_layers_per_vgg_block: List[int] kernel_sizes_per_vgg_block: List[List[int]] n_channels_per_vgg_block: List[List[int]] vgg_pooling_kernel_size: List[int] d_model: int ff_size: int h: int joint_size: int left_size: int right_size: int p_dropout: float masking_value: int = -1e15 _name = "vgg_transformer" _type = TRANSDUCER_TYPE
[docs]@dataclass class TransformerTransducerTemp(BaseTemplate): """Transformer-Transducer model template https://arxiv.org/abs/2002.02562 Args: in_features (int): The input feature size. n_layers (int): The number of transformer encoder layers with truncated self attention. n_dec_layers (int): The number of layers in the decoder (predictor). d_model (int): The model dimensionality. ff_size (int): The feed forward inner layer dimensionality. h (int): The number of heads in the attention mechanism. joint_size (int): The joint layer feature size. enc_left_size (int): The size of the left window that each time step is allowed to look at in the encoder. enc_right_size (int): The size of the right window that each time step is allowed to look at in the encoder. dec_left_size (int): The size of the left window that each time step is allowed to look at in the decoder. dec_right_size (int): The size of the right window that each time step is allowed to look at in the decoder. p_dropout (float): The dropout rate. stride (int): The stride of the convolution layer in the prenet. Default 1. kernel_size (int): The kernel size of the convolution layer in the prenet. Default 1. masking_value (float, optional): The value to use for masking padded elements. Defaults to -1e15. """ in_features: int n_layers: int n_dec_layers: int d_model: int ff_size: int h: int joint_size: int enc_left_size: int enc_right_size: int dec_left_size: int dec_right_size: int p_dropout: float stride: int = 1 kernel_size: int = 1 masking_value: int = -1e15 _name = "transformer_transducer" _type = TRANSDUCER_TYPE