Source code for speeq.data.processes

"""
This module contains classes for speech processing that implement the IProcess interface.

Classes:

- AudioLoader: Loads and resamples an audio file to the targeted sample rate.
- FeatExtractor: Extracts frequency features from a given time domain signal, supporting mfcc and mel spectrogram.
- FeatStacker: Implements feature stacking operation by stacking consecutive time stamps along the feature space.
- FrameContextualizer: Implements frame contextualizer through time as described in https://arxiv.org/abs/1412.5567

Every class implements the abstract ``run`` method, which applies the process to its input.

Example usage:


    .. code-block:: python

        # Import required packages and modules
        import torch
        from speeq.data.processes import AudioLoader, FeatExtractor, FeatStacker, FrameContextualizer

        # Define the audio file path
        audio_path = 'path/to/audio.wav'

        # Create an instance of AudioLoader
        audio_loader = AudioLoader(sample_rate=16000)

        # Load the audio file using AudioLoader
        audio_tensor = audio_loader.run(audio_path)

        # Create an instance of FeatExtractor
        feat_extractor = FeatExtractor(feat_ext_name='mfcc', feat_ext_args={'n_mfcc': 13})

        # Extract the MFCC features of the audio tensor using FeatExtractor
        feat_tensor = feat_extractor.run(audio_tensor)

        # Create an instance of FeatStacker
        feat_stacker = FeatStacker(feat_stack_factor=2)

        # Stack the features using FeatStacker
        stacked_feat_tensor = feat_stacker.run(feat_tensor)

        # Create an instance of FrameContextualizer
        frame_contextualizer = FrameContextualizer(contex_size=2)

        # Add context to the features using FrameContextualizer
        contextualized_feat_tensor = frame_contextualizer.run(stacked_feat_tensor)
"""

import functools
import random
from abc import abstractmethod
from pathlib import Path
from typing import Union

import torch
import torchaudio
from torch import Tensor, nn
from torchaudio import transforms

from speeq.interfaces import IProcess

# Maximum number of `transforms.Resample` objects that AudioLoader keeps cached for reuse.
SAMPLER_CACHE_SIZE = 5


class StochasticProcess(IProcess):
    """An interface that applies the process functionality based on the ratio
    provided.

    Subclasses implement `func`. On every call, `run` draws a random number
    and either returns `func(x)` or returns the input unchanged.

    Args:
        ratio (float): The rate of applying the process on the input. A value
            of 0 never applies it and a value of 1 (or more) always applies it.
    """

    def __init__(self, ratio: float) -> None:
        super().__init__()
        self.ratio = ratio

    @property
    def _shall_do(self) -> bool:
        # random.random() is uniform on [0.0, 1.0); the strict comparison
        # guarantees that ratio=0 never fires while ratio=1 always does.
        return random.random() < self.ratio

    @abstractmethod
    def func(self, x):
        """Applies the process on the input unconditionally.

        Args:
            x: The input to be processed.

        Returns:
            The processed input.
        """
        raise NotImplementedError

    def run(self, x):
        """Applies `func` on the input with probability `ratio`.

        Args:
            x: The input to be processed.

        Returns:
            The result of `func(x)` if the process got applied, otherwise
            the input `x` as is.
        """
        if self._shall_do:
            return self.func(x)
        return x
class AudioLoader(IProcess):
    """Loads and resamples audio to the specified sample rate.

    .. note::

        This class utilizes the `load` function provided by `torchaudio`
        framework for loading audio. For additional details on supported file
        formats and further information, please refer to
        `the documentation <https://pytorch.org/audio/stable/index.html>`_.

    Args:
        sample_rate (int): The target sampling rate.
    """

    def __init__(self, sample_rate: int) -> None:
        super().__init__()
        self.sample_rate = sample_rate
        # Per-instance cache mapping an original sample rate to its resampler.
        # A cache decorator on the method would key on `self`, keeping every
        # instance alive and making all instances compete for the same slots.
        self._resamplers = {}

    def _get_resampler(self, original_sr: int):
        """Returns a resampler from `original_sr` to the target sample rate.

        Resamplers are reused across calls; at most `SAMPLER_CACHE_SIZE` of
        them are kept per instance, evicting the least recently used one.

        Args:
            original_sr (int): The sample rate of the loaded audio.

        Returns:
            The `transforms.Resample` object for the given original rate.
        """
        # Popping and re-inserting on every access keeps the dict ordered
        # from the least to the most recently used entry.
        resampler = self._resamplers.pop(original_sr, None)
        if resampler is None:
            if len(self._resamplers) >= SAMPLER_CACHE_SIZE:
                # The first key is the least recently used one.
                del self._resamplers[next(iter(self._resamplers))]
            resampler = transforms.Resample(
                orig_freq=original_sr, new_freq=self.sample_rate
            )
        self._resamplers[original_sr] = resampler
        return resampler

    def run(self, file_path: Union[Path, str]) -> Tensor:
        """Load and resample an audio file.

        Args:
            file_path (Union[Path, str]): The path to the audio file to be
                loaded.

        Returns:
            Tensor: A tensor containing the speech data of shape [C, M].
        """
        x, sr = torchaudio.load(file_path)
        return self._get_resampler(sr)(x)
class FeatExtractor(IProcess):
    """Extracts frequency features from a time domain signal, using either
    `mfcc` or `mel spectrogram` features.

    .. note::

        Feature extraction is delegated to the `transforms.MelSpectrogram`
        and `transforms.MFCC` classes of the `torchaudio` framework. For
        their parameters and further details, please refer to
        `the documentation <https://pytorch.org/audio/stable/index.html>`_.

    Args:
        feat_ext_name (str): Which feature extractor to use, either `mfcc`
            or `melspec`.

        feat_ext_args (dict): The keyword arguments forwarded to the chosen
            feature extractor's constructor. See the `torchaudio`
            documentation for the accepted parameters.
    """

    # Maps the supported extractor names to their torchaudio transform class.
    __feat_extractor = dict(mfcc=transforms.MFCC, melspec=transforms.MelSpectrogram)

    def __init__(
        self,
        feat_ext_name: str,
        feat_ext_args: dict,
    ) -> None:
        super().__init__()
        # An unsupported name surfaces as a KeyError from this lookup.
        extractor_cls = self.__feat_extractor[feat_ext_name]
        self.feat_extractor = extractor_cls(**feat_ext_args)

    def run(self, x: Tensor) -> Tensor:
        """Converts the time domain signal `x` into frequency domain features
        using the configured feature extractor.

        Args:
            x (Tensor): The time domain signal, passed as is to the feature
                extractor.

        Returns:
            Tensor: The frequency domain features of shape [..., T, F], i.e.
            the extractor's output with its last two axes swapped.
        """
        features = self.feat_extractor(x)
        # Put time before the feature axis: (..., T, F)
        return features.transpose(-1, -2)
class FeatStacker(IProcess):
    """A class that implements feature stacking by stacking `n` consecutive
    time stamps along the feature space.

    Args:
        feat_stack_factor (int): The factor by which to stack the features.
            Must be greater than 1.

    Example:

        .. code-block:: python

            # Import required packages
            import torch
            from speeq.data.processes import FeatStacker

            batch_size = 3
            max_len = 10
            feat_size = 15
            stacking_factor = 2

            # creating dummy data
            input = torch.randn(batch_size, max_len, feat_size)

            # Create an instance of the class
            stacker = FeatStacker(feat_stack_factor=stacking_factor)

            # Apply the process to the input
            result = stacker.run(input)

            # Print the result's shape
            print(result.shape)  # torch.Size([3, 5, 30])
    """

    def __init__(self, feat_stack_factor: int) -> None:
        super().__init__()
        assert feat_stack_factor > 1, "feat_stack_factor must be greater than 1"
        self.feat_stack_factor = feat_stack_factor

    def run(self, x: Tensor):
        """Applies feature stacking to the input tensor x by stacking `n`
        consecutive time frames along the feature space.

        If the number of time frames is not divisible by `n`, the input is
        zero padded at the end of the time axis up to the next multiple of
        `n` before stacking.

        Args:
            x (Tensor): The input tensor of shape [..., T, F]

        Returns:
            Tensor: The result tensor after applying feature stacking, of
            shape [..., ceil(T / n), F * n], with the same dtype and device
            as the input.
        """
        factor = self.feat_stack_factor
        # Only reachable if the attribute got changed after construction (or
        # assertions are disabled); stacking by 1 is the identity.
        if factor == 1:
            return x
        residual = x.shape[-2] % factor
        if residual != 0:
            pad_shape = list(x.shape)
            pad_shape[-2] = factor - residual
            # new_zeros keeps the dtype and device of x, so the padding never
            # silently promotes (or mismatches) the dtype of the result.
            x = torch.cat([x, x.new_zeros(pad_shape)], dim=-2)
        # reshape (unlike view) also works on non-contiguous inputs, such as
        # a tensor whose last two axes were swapped. The target sizes are
        # spelled out so an empty time axis is handled as well.
        return x.reshape(*x.shape[:-2], x.shape[-2] // factor, x.shape[-1] * factor)
class FrameContextualizer(IProcess):
    """Implements frame contextualization through time, as described in
    https://arxiv.org/abs/1412.5567

    Args:
        contex_size (int): The context size, i.e., the number of left or
            right frames to consider with the current frame.

    Example:

        .. code-block:: python

            # Import required packages
            import torch
            from speeq.data.processes import FrameContextualizer

            max_len = 10
            feat_size = 15

            # 2 to the left, the current time step and 2 to the right
            contex_size = 2

            # creating dummy data
            input = torch.randn(1, max_len, feat_size)

            # Create an instance of the class
            contextualizer = FrameContextualizer(contex_size=contex_size)

            # Apply the process to the input
            result = contextualizer.run(input)

            # Print the result's shape
            print(result.shape)  # torch.Size([1, 10, 75])
    """

    def __init__(self, contex_size: int) -> None:
        super().__init__()
        self.contex_size = contex_size
        # Left context + current frame + right context.
        self.win_size = self.contex_size * 2 + 1
        self.conv = nn.Conv1d(
            in_channels=1,
            out_channels=self.win_size,
            kernel_size=self.win_size,
            bias=False,
        )
        # Identity kernels: output channel `w` copies the frame at offset `w`
        # inside the window, so the convolution only gathers the neighbours.
        self.conv.weight.data = torch.eye(self.win_size).view(
            self.win_size, 1, self.win_size
        )
        self.conv.weight.requires_grad = False

    def run(self, x: Tensor) -> Tensor:
        """Applies frame contextualization on the input tensor x.

        Both ends of the time axis are zero padded with `contex_size` frames,
        so the number of time steps is preserved.

        Args:
            x (Tensor): The input tensor of shape [1, M, F]. It has to live
                on the same device as `self.conv`.

        Returns:
            Tensor: The output tensor of shape
            [1, M, F * (2 * contex_size + 1)]
        """
        x = x.permute(2, 0, 1)  # [F, 1, T]
        # Create the padding on the input's device; a CPU-only padding makes
        # the concatenation fail for inputs living on any other device.
        zeros = torch.zeros(x.shape[0], 1, self.contex_size, device=x.device)
        x = torch.cat([zeros, x, zeros], dim=-1)
        x = self.conv(x)  # [F, W, T]
        x = x.permute(2, 1, 0).contiguous()  # [T, W, F]
        x = x.view(1, x.shape[0], -1)  # [1, T, W * F]
        return x