Source code for speeq.data.augmenters

"""
This module offers a variety of data augmentation techniques that operate in
either the time or the frequency domain. Every augmenter implemented in this
module is a subclass of either StochasticProcess or IProcess and exposes a
method named run, which applies the augmentation to the input signal.

The classes implemented in this module can be divided into two groups based
on the domain they operate on.

The time domain augmenters include:

- WhiteNoiseInjector: Adds white noise to the input signal.
- VolumeChanger: Changes the volume of the input signal by applying a random gain.
- ConsistentAttenuator: Attenuates the amplitude of the input signal by a random single value.
- VariableAttenuator: Attenuates the amplitude of the input signal by applying a random gain, where the gain varies across time steps.
- Reverberation: Adds a reverberation effect to the input signal.

The frequency domain augmenters are:

- FrequencyMasking: Masks random frequency bins in the input spectrogram.
- TimeMasking: Masks a random time segment in the input spectrogram.


.. code-block:: python

    # Import the module
    import torch
    from speeq.data import augmenters

    # creating dummy signal
    signal = torch.randn(1, 100)

    # Create an instance of the augmenter
    # Will use WhiteNoiseInjector example for illustration
    noise_injector = augmenters.WhiteNoiseInjector()

    # Apply the augmentation to the signal
    augmented_signal = noise_injector.run(signal)

"""
import random

import torch
from torch import Tensor

from .processes import StochasticProcess


class WhiteNoiseInjector(StochasticProcess):
    r"""Injects random Gaussian noise into the input signal.

    The input time-domain signal `x` is summed with freshly sampled standard
    Gaussian noise that is scaled by a random gain, as the equation below
    shows:

        .. math::

            x_{augmented} = x + noise \cdot gain \cdot gain\_mul

    where `gain` is a random number in [0, 1) drawn once per call, and
    `noise` has the same shape as `x`.

    Args:
        ratio (float): The ratio/rate that the augmentation will be applied
            to the data. Default 1.0

        gain_mul (float): The gain multiplier factor to control the strength
            of the noise. Default 0.05
    """

    def __init__(self, ratio=1.0, gain_mul=5e-2) -> None:
        super().__init__(ratio)
        self.gain_mul = gain_mul

    def func(self, x: Tensor) -> Tensor:
        """Return `x` plus scaled white Gaussian noise.

        Args:
            x (Tensor): The time-domain signal to augment.

        Returns:
            Tensor: The noisy signal, same shape as `x`.
        """
        # A single scalar gain is shared by every sample of this call.
        gain = random.random() * self.gain_mul
        # randn_like already allocates on x's device, so no transfer is needed.
        return x + gain * torch.randn_like(x)


class VolumeChanger(StochasticProcess):
    r"""Scales the input signal by a random gain.

    This changes the amplitude of the input time domain signal `x` by
    multiplying it with a random gain `gain`, which is computed using the
    following equation:

    .. math::

        gain = (max\_gain - min\_gain) \cdot U + min\_gain

    where `U` is a random number in [0, 1). The resulting signal
    `x_augmented` can be computed as follows:

    .. math::

        x_{augmented} = x \cdot gain

    The signal is amplified when the drawn gain is above 1 and attenuated
    when it is below 1.

    Args:
        ratio (float): The ratio/rate that the augmentation will be applied
            to the data. Default 1.0

        min_gain (float): The minimum gain that will be multiplied by the
            signal.

        max_gain (float): The maximum gain that will be multiplied by the
            signal.
    """

    def __init__(self, min_gain: float, max_gain: float, ratio=1.0) -> None:
        super().__init__(ratio)
        self.min_gain = min_gain
        self.max_gain = max_gain
        # Width of the gain interval, computed once instead of on every draw.
        self._diff = self.max_gain - self.min_gain

    @property
    def _gain(self) -> float:
        """Draw a fresh gain between `min_gain` and `max_gain` on every access."""
        return self._diff * random.random() + self.min_gain

    def func(self, x: Tensor) -> Tensor:
        """Return `x` multiplied by one randomly drawn scalar gain.

        Args:
            x (Tensor): The time-domain signal to augment.

        Returns:
            Tensor: The rescaled signal, same shape as `x`.
        """
        return self._gain * x


class ConsistentAttenuator(VolumeChanger):
    r"""Applies amplitude attenuation to the input signal by multiplying it by
    a single random gain, such that the gain is consistent across all time
    steps. The augmented signal x_augmented is given by the following
    equation:

        .. math::

            x_{augmented} = x \cdot U

    where `U` is a random number between `min_gain` and 1, and x is the input
    time domain signal.

    Args:
        ratio (float): The ratio/rate that the augmentation will be applied
            to the data. Default 1.0

        min_gain (float): The minimum gain that will be multiplied by the
            signal. Default 0.1
    """

    def __init__(self, ratio=1.0, min_gain=0.1) -> None:
        # The upper bound of the gain is pinned to 1 so the parent class can
        # only keep or reduce the amplitude (for min_gain <= 1).
        super().__init__(ratio=ratio, min_gain=min_gain, max_gain=1)


class VariableAttenuator(StochasticProcess):
    r"""Perturbs the amplitude of the input signal with a random gain that
    varies across time steps. Every sample of the signal is scaled by its own
    gain, as the equation below shows:

        .. math::

            x_{augmented} = x + x \cdot N \cdot noise\_mul
                          = x \cdot (1 + N \cdot noise\_mul)

    where x is the input time-domain signal and N is standard Gaussian noise
    (zero mean, unit variance) with the same shape as x.

    Note:
        Despite the class name, the per-sample gain is not restricted to
        [0, 1]: N is signed and unbounded, so an individual sample may be
        attenuated, amplified or even sign-flipped.

    Args:
        ratio (float): The ratio/rate that the augmentation will be applied
            to the data. Default 1.0

        noise_mul (float): The noise multiplier, i.e. the standard deviation
            of the relative perturbation. Default 0.5
    """

    def __init__(self, ratio=1.0, noise_mul=0.5) -> None:
        super().__init__(ratio)
        self.noise_mul = noise_mul

    def func(self, x: Tensor) -> Tensor:
        """Return `x` with every sample scaled by its own random gain.

        Args:
            x (Tensor): The time-domain signal to augment.

        Returns:
            Tensor: The perturbed signal, same shape as `x`.
        """
        # randn_like already allocates on x's device, so no transfer is needed.
        return x + x * self.noise_mul * torch.randn_like(x)


class Reverberation(StochasticProcess):
    """Reverberates the input signal by generating a random impulse response
    and convolving it with the speech signal.

    The impulse response is Gaussian noise shaped by a bell-like envelope
    (1 - tanh(t / alpha) ** 2) evaluated on an evenly spaced grid between
    `start_val` and `end_val`, where alpha is a random width in
    [eps, eps + 1). The output has the same number of time steps as the
    input.

    Args:
        ratio (float): The ratio/rate that the augmentation will be applied
            to the data. Default 1.0

        min_len (int): The minimum impulse response length. Default 1000.

        max_len (int): The maximum impulse response length. Default 4000.

        start_val (int): The starting value of the impulse response
            generation function. Default -10.

        end_val (int): The end value of the impulse response generation
            function. Default 10.

        eps (float): smoothing value, to prevent division by 0.
            Default to 1e-3.
    """

    def __init__(
        self, ratio=1.0, min_len=1000, max_len=4000, start_val=-10, end_val=10, eps=1e-3
    ) -> None:
        super().__init__(ratio)
        self.min_len = min_len
        self.max_len = max_len
        self.start_val = start_val
        self.end_val = end_val
        self.eps = eps

    def _get_impulse_response(self) -> Tensor:
        """Generate a random impulse response of shape [1, 1, length].

        The length is drawn uniformly from [min_len, max_len] (inclusive).
        """
        length = random.randint(self.min_len, self.max_len)
        x = torch.linspace(self.start_val, self.end_val, length)
        # eps keeps the divisor strictly positive when random() returns 0.
        alpha = self.eps + random.random()
        x /= alpha
        # 1 - tanh(x)^2 is the same envelope as the exp-based formula, but
        # tanh saturates cleanly instead of overflowing to inf / inf.
        # nan_to_num is kept as a guard for degenerate inputs (e.g. eps == 0).
        envelope = (1 - torch.tanh(x) ** 2).nan_to_num()
        h = torch.randn_like(envelope) * envelope
        return h.view(1, 1, length)

    def func(self, x: Tensor) -> Tensor:
        """Convolve `x` with a freshly generated impulse response.

        Args:
            x (Tensor): A single-channel time-domain signal of shape
                [1, time] or [batch, 1, time].

        Returns:
            Tensor: The reverberated signal with the same number of time
            steps as `x`. A leading dimension of size 1 is squeezed, so a
            [1, time] input yields a [1, time] output.
        """
        if x.dim() == 2:
            x = x.unsqueeze(dim=0)
        ir = self._get_impulse_response()
        # Bring both operands to a common dtype so conv1d accepts them
        # (integer input is promoted to float, float64 input stays float64).
        dtype = torch.promote_types(x.dtype, ir.dtype)
        x = x.to(dtype)
        ir = ir.to(device=x.device, dtype=dtype)
        # conv1d computes a cross-correlation; flipping the kernel turns it
        # into a true convolution.
        ir = ir.flip(dims=[-1])
        ir_length = ir.shape[-1]
        # "Same" padding: left + right must equal ir_length - 1 so that the
        # output keeps exactly the input's number of time steps.
        left = ir_length // 2
        right = ir_length - 1 - left
        x = torch.nn.functional.pad(x, (left, right))
        return torch.nn.functional.conv1d(x, ir).squeeze(dim=0)


class _BaseMasking(StochasticProcess):
    """Shared logic for the masking augmenters.

    Args:
        n (int): The number of times to apply the masking operation.

        max_length (int): The maximum masking length.

        ratio (float): The ratio/rate that the augmentation will be applied to
        the data. Default 1.0
    """

    def __init__(self, n: int, max_length: int, ratio=1.0) -> None:
        super().__init__(ratio)
        self.n = n
        self.max_length = max_length

    def _get_mask(self, x: Tensor, dim=-1):
        """Build a 0/1 mask shaped like `x` with `n` random spans zeroed
        along `dim`.

        Each span starts at a random index in [0, size] and ends at a random
        index in [start, start + max_length], clipped to the axis size, so a
        span may be empty.
        """
        size = x.shape[dim]
        mask = torch.ones_like(x)
        for _ in range(self.n):
            lo = random.randint(0, size)
            hi = min(size, random.randint(lo, lo + self.max_length))
            span = torch.arange(lo, hi, device=x.device)
            # mask is a fresh tensor owned by this call, so filling it in
            # place is safe.
            mask.index_fill_(dim, span, 0)
        return mask


class FrequencyMasking(_BaseMasking):
    """Masks the input spectrogram along the frequency axis.

    Args:
        n (int): The number of times to apply the masking operation.

        max_length (int): The maximum masking length.

        ratio (float): The ratio/rate that the augmentation will be applied to
        the data. Default 1.0
    """

    def __init__(self, n: int, max_length: int, ratio=1.0) -> None:
        super().__init__(n, max_length, ratio)

    def func(self, x: Tensor) -> Tensor:
        """Zero out random spans of the last (frequency) axis.

        x (Tensor): the input spectrogram to be augmented of
        shape [..., time, freq].
        """
        freq_mask = self._get_mask(x, dim=-1)
        return x * freq_mask


class TimeMasking(_BaseMasking):
    """Masks the input spectrogram along the time axis.

    Args:
        n (int): The number of times to apply the masking operation.

        max_length (int): The maximum masking length.

        ratio (float): The ratio/rate that the augmentation will be applied to
        the data. Default 1.0
    """

    def __init__(self, n: int, max_length: int, ratio=1.0) -> None:
        super().__init__(n, max_length, ratio)

    def func(self, x: Tensor) -> Tensor:
        """Zero out random spans of the second-to-last (time) axis.

        x (Tensor): the input spectrogram to be augmented of
        shape [..., time, freq].
        """
        time_mask = self._get_mask(x, dim=-2)
        return x * time_mask