Source code for libdse.data.features

"""Feature extraction utilities for log-mel power spectrograms.

This module provides the abstract :class:`BaseExtractor` interface and the
concrete :class:`LogMelPowerSpectrumExtractor` implementation used to build
training samples for the denoising autoencoder (DAE).

The feature pipeline converts raw mono waveforms into fixed-width log-mel
power spectrogram vectors following the approach described in:

    Lu, X., Tsao, Y., Matsuda, S., & Hori, C. (2013). *Speech enhancement
    based on deep denoising autoencoder*. INTERSPEECH 2013.

Pipeline summary
----------------
1. Compute the short-time Fourier transform (STFT) with a Hann window.
2. Project the magnitude-squared spectrum onto a mel filterbank and take the
   natural logarithm (a small constant is added first to avoid ``log(0)``).
3. Divide the resulting mel spectrogram into non-overlapping temporal windows
   of *chunks_per_feature* frames; discard incomplete trailing windows.
4. Flatten each window into a 1-D vector of length
   ``n_mels * chunks_per_feature``.

When a :class:`~libdse.data.noise.DEMANDNoiseDataset` is supplied to
:class:`LogMelPowerSpectrumExtractor`, a noisy copy of the waveform is
synthesised on the fly by :func:`~libdse.data.noise.add_noise_snr`.  The
returned training pair is then ``(noisy_feature, clean_feature)`` instead of
``(clean_feature, clean_feature)``.

Classes
-------
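- :class:`BaseExtractor` — Abstract base; subclass to define custom extractors.
- :class:`LogMelPowerSpectrumExtractor` — Log-mel power spectrum extractor.
- :class:`PowerSpectrumExtractor` — Raw magnitude power spectrum extractor.
- :class:`LogMagnitudeSpectrumExtractor` — Log-magnitude spectrum extractor.
- :class:`RawWaveformExtractor` — Raw time-domain window extractor.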

Typical usage
-------------
.. code-block:: python

    from pathlib import Path
    from libdse.data.features import LogMelPowerSpectrumExtractor
    from libdse.data.noise import DEMANDNoiseDataset, DEMANDNoiseType

    noise_ds = DEMANDNoiseDataset(
        entry_point=Path("data/noise/DEMAND"),
        noise_types=DEMANDNoiseType.ALL,
    )
    extractor = LogMelPowerSpectrumExtractor(
        sampling_rate=16_000,
        window_length=512,
        hop_length=128,
        n_mels=40,
        chunks_per_feature=7,
        noise=noise_ds,
    )
    # Called once per utterance inside a DataLoader worker — yields one pair
    # per non-overlapping spectrogram window:
    for noisy_feat, clean_feat in extractor(waveform):
        ...
"""

from abc import ABC, abstractmethod

import random
import numpy as np
import librosa
from typing import Generator
from numpy.typing import NDArray
from torch import Tensor
import torch

from libdse.data.noise import DEMANDNoiseDataset, add_noise_snr

#: Type alias for a feature tensor returned by an extractor.
Sample = Tensor
#: Type alias for a label (target) tensor returned by an extractor.
Label = Tensor


class BaseExtractor(ABC):
    """Abstract base class for feature extractors.

    Defines the interface expected by
    :class:`~libdse.data.librispeech.LibriSpeechDataset`.  Concrete
    subclasses must implement :meth:`__call__`, which converts a raw mono
    waveform into a generator of ``(sample, label)`` tensor pairs.

    .. attribute:: sample_shape
        :type: tuple[int, ...]

        Shape of a single feature vector produced by this extractor.  Must
        be set in the subclass ``__init__`` before the instance is passed to
        :class:`~libdse.data.librispeech.LibriSpeechDataset`.

    .. attribute:: noise
        :type: DEMANDNoiseDataset or None

        Noise source read by :meth:`_noise_for_sample`.  Its ``noise``
        attribute must be a 1-D sequence of noise samples.
    """

    @abstractmethod
    def __init__(self):
        # Declarations only: these annotations assign nothing.  They record
        # the attributes every concrete extractor is expected to set.
        self.sample_shape: tuple[int, ...]
        self.noise: DEMANDNoiseDataset | None

    @abstractmethod
    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield ``(feature, label)`` pairs from a raw audio waveform.

        The number of pairs depends on the duration of *sample* and on how
        the concrete extractor frames it.

        :param sample: Mono audio waveform at the extractor's expected
            sampling rate.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each tensor
            having shape :attr:`sample_shape`.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """

    def _noise_for_sample(
        self, sample: NDArray[np.float32]
    ) -> NDArray[np.float32]:
        """Return a randomly positioned noise segment as long as *sample*.

        The start offset is drawn uniformly from every position at which a
        slice of ``len(sample)`` samples still fits inside the noise array.

        :param sample: Clean audio waveform.  Only its length is used.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Noise segment with ``len(sample)`` samples.
        :rtype: :class:`numpy.ndarray` of float32
        :raises ValueError: If the noise array is shorter than *sample*, so
            no valid start offset exists.
        """
        # The noise waveform is stored on the ``noise`` attribute of the
        # dataset object held in ``self.noise``.
        noise_array = self.noise.noise
        n_needed = len(sample)
        max_noise_start = len(noise_array) - n_needed
        if max_noise_start < 0:
            # random.randint would fail here with an opaque "empty range"
            # message; raise the same exception type with a useful one.
            raise ValueError(
                f"noise array ({len(noise_array)} samples) is shorter than "
                f"the sample ({n_needed} samples); cannot draw a noise segment"
            )
        # randint is inclusive on both ends, so the last valid offset
        # (max_noise_start) can be drawn as well.
        noise_start = random.randint(0, max_noise_start)
        return noise_array[noise_start : noise_start + n_needed]
[docs] class LogMelPowerSpectrumExtractor(BaseExtractor): """Log-mel power spectrum feature extractor. Converts a raw mono waveform into a sequence of log-mel power spectrogram feature vectors. The STFT is computed with a Hann window with 50% overlap; the power spectrum is projected through a mel filterbank; and the spectrogram is divided into non-overlapping windows of *chunks_per_feature* frames. Calling an instance yields one ``(feature, label)`` pair per window. When *noise* is provided, a noisy version of the waveform is synthesised by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of 0, 5, or 10 dB, and every yielded pair becomes ``(noisy_feature, clean_feature)``. The extractor is designed to be instantiated **once** and called repeatedly — one call per utterance — from inside a :class:`~torch.utils.data.DataLoader`. :param sampling_rate: Expected sample rate of input waveforms in Hz. :type sampling_rate: int :param window_length: STFT window length in samples (also used as the FFT size). :type window_length: int :param hop_length: STFT hop size in samples. :type hop_length: int :param n_mels: Number of mel filterbank bins. :type n_mels: int :param chunks_per_feature: Number of consecutive spectrogram frames per output feature vector. :type chunks_per_feature: int :param noise: Optional DEMAND noise dataset used for on-the-fly noise mixing. Pass ``None`` for clean-only feature extraction. :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None Example ------- .. 
code-block:: python extractor = MelPowerSpectrumExtractor( sampling_rate=16_000, window_length=512, hop_length=128, n_mels=40, chunks_per_feature=7, noise=None, ) for feature, label in extractor(waveform): assert feature.shape == (40 * 7,) """ def __init__( self, sampling_rate: int, window_length: int, hop_length: int, n_mels: int, chunks_per_feature: int, noise: DEMANDNoiseDataset | None, ) -> None: self.fs = sampling_rate self.window_length = window_length self.hop_length = hop_length self.n_mels = n_mels self.chunks_per_feature = chunks_per_feature self.noise = noise # Build the mel filterbank once at construction time so it is not # recomputed on every call. Shape: (n_mels, 1 + window_length // 2) self.mel_bank = librosa.filters.mel( sr=self.fs, n_fft=self.window_length, n_mels=self.n_mels ) #: Flat length of each feature vector: ``n_mels * chunks_per_feature``. self.sample_shape = (n_mels * chunks_per_feature,)
[docs] def mel_power_spectrum( self, sample: NDArray[np.float32], ) -> NDArray[np.float32]: """Compute a (log)-mel power spectrogram and split it into fixed-length chunks. Follows the feature extraction procedure described in: Lu, X. et al. (2012). *Speech Restoration Based on Deep Learning Autoencoder with Layer-Wised Pretraining*. The spectrogram is divided into non-overlapping temporal windows of :attr:`chunks_per_feature` frames. Incomplete trailing windows are discarded without padding. :param sample: Mono audio waveform. :type sample: :class:`numpy.ndarray` of float32 :return: Array of shape ``(n_chunks, n_mels * chunks_per_feature)`` where each row is a flattened temporal window. :rtype: :class:`numpy.ndarray` of float32 """ chunks = [] # STFT: complex array of shape (1 + window_length // 2, n_frames) stft = librosa.core.stft( y=sample, n_fft=self.window_length, win_length=self.window_length, hop_length=self.hop_length, window="hann", ) # Power mel spectrogram: (n_mels, n_frames) # mel_bank @ |STFT|^2 projects the power spectrum onto the mel scale. mel_spec = np.log(self.mel_bank @ np.abs(stft) ** 2 + 1e-8) # Extract non-overlapping windows along the time axis. # Each window has shape (n_mels, chunks_per_feature) and becomes one row. n_frames = mel_spec.shape[1] for i in range( 0, n_frames - self.chunks_per_feature + 1, self.chunks_per_feature, ): chunks.append(mel_spec[:, i : i + self.chunks_per_feature]) # Stack and flatten: (n_chunks, n_mels, chunks_per_feature) # → (n_chunks, n_mels * chunks_per_feature) out = np.stack(chunks, axis=0) return out.reshape((out.shape[0], -1))
[docs] def __call__( self, sample: NDArray[np.float32] ) -> Generator[tuple[Sample, Label], None, None]: """Yield ``(feature, label)`` pairs for every non-overlapping window. The waveform is converted to a mel power spectrogram, divided into non-overlapping windows of :attr:`chunks_per_feature` frames, and one pair is yielded per window. Incomplete trailing windows are discarded. When :attr:`noise` is set, a synthetic noisy copy of the waveform is blended at a randomly selected SNR of 0, 5, or 10 dB before feature extraction, and the pair becomes ``(noisy_feature, clean_feature)``. :param sample: Mono audio waveform at :attr:`fs` Hz. :type sample: :class:`numpy.ndarray` of float32 :return: Generator of ``(feature, label)`` tensor pairs, each tensor having shape ``(n_mels * chunks_per_feature,)``. :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None] """ # Compute mel spectrogram chunks for the clean signal. # Shape: (n_chunks, n_mels * chunks_per_feature) mel_pspec_orig = self.mel_power_spectrum(sample) if self.noise is not None: # Blend a randomly positioned noise segment at a randomly chosen SNR. y_noise = add_noise_snr( signal=sample, noise=self._noise_for_sample(sample), snr_db=random.choice([0, 5, 10]), ) mel_pspec_noise = self.mel_power_spectrum(y_noise) for orig_row, noisy_row in zip(mel_pspec_orig, mel_pspec_noise): yield ( torch.from_numpy(noisy_row).float(), torch.from_numpy(orig_row).float(), ) else: for orig_row in mel_pspec_orig: yield ( torch.from_numpy(orig_row).float(), torch.from_numpy(orig_row).float(), )
[docs] class PowerSpectrumExtractor(BaseExtractor): """Raw magnitude power spectrum feature extractor (no mel projection). Converts a raw mono waveform into a sequence of single-sided magnitude power spectrum frames. Unlike :class:`MelPowerSpectrumExtractor`, no mel filterbank is applied — the full ``(1 + window_length // 2)``-bin power spectrum of each STFT frame is used directly as a feature vector. Calling an instance yields one ``(feature, label)`` pair per STFT frame. When *noise* is provided, a noisy version of the waveform is synthesised by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of 0, 5, or 10 dB, and every yielded pair becomes ``(noisy_feature, clean_feature)``. The extractor is designed to be instantiated **once** and called repeatedly — one call per utterance — from inside a :class:`~torch.utils.data.DataLoader`. :param sampling_rate: Expected sample rate of input waveforms in Hz. :type sampling_rate: int :param window_length: STFT window length in samples (also used as the FFT size). Each feature vector has length ``1 + window_length // 2``. :type window_length: int :param hop_length: STFT hop size in samples. :type hop_length: int :param noise: Optional DEMAND noise dataset used for on-the-fly noise mixing. Pass ``None`` for clean-only feature extraction. :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None Example ------- .. code-block:: python extractor = MagnitudePowerSpectrumExtractor( sampling_rate=16_000, window_length=512, hop_length=256, noise=None, ) for feature, label in extractor(waveform): assert feature.shape == (257,) # 1 + 512 // 2 """ def __init__( self, sampling_rate: int, window_length: int, hop_length: int, noise: DEMANDNoiseDataset | None, ) -> None: self.fs = sampling_rate self.window_length = window_length self.hop_length = hop_length self.noise = noise #: Flat length of each feature vector: ``1 + window_length // 2`` #: (the number of unique frequency bins in the single-sided STFT). 
self.sample_shape = (1 + window_length // 2,)
[docs] def magnitude_power_spectrum( self, sample: NDArray[np.float32], ) -> NDArray[np.float32]: """Compute the single-sided magnitude power spectrum frame by frame. Applies the STFT with a Hann window and returns ``|STFT|²`` — the power of each frequency bin for every frame. No mel projection is applied. :param sample: Mono audio waveform. :type sample: :class:`numpy.ndarray` of float32 :return: Array of shape ``(1 + window_length // 2, n_frames)`` where each column is the power spectrum of one STFT frame. :rtype: :class:`numpy.ndarray` of float32 """ # STFT: complex array of shape (1 + window_length // 2, n_frames) stft = librosa.core.stft( y=sample, n_fft=self.window_length, win_length=self.window_length, hop_length=self.hop_length, window="hann", ) # Power spectrogram: (1 + window_length // 2, n_frames) return np.abs(stft) ** 2
[docs] def __call__( self, sample: NDArray[np.float32] ) -> Generator[tuple[Sample, Label], None, None]: """Yield ``(feature, label)`` pairs for every STFT frame. The waveform is converted to a magnitude power spectrogram and one pair is yielded per frame (column of the spectrogram). Each tensor contains the single-sided power spectrum of that frame. When :attr:`noise` is set, a synthetic noisy copy of the waveform is blended at a randomly selected SNR of 0, 5, or 10 dB before feature extraction, and the pair becomes ``(noisy_feature, clean_feature)``. :param sample: Mono audio waveform at :attr:`fs` Hz. :type sample: :class:`numpy.ndarray` of float32 :return: Generator of ``(feature, label)`` tensor pairs, each tensor having shape ``(1 + window_length // 2,)``. :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None] """ # Compute power spectrogram for the clean signal. # Shape: (1 + window_length // 2, n_frames) power_spec = self.magnitude_power_spectrum(sample) if self.noise is not None: # Blend a randomly positioned noise segment at a randomly chosen SNR. y_noise = add_noise_snr( signal=sample, noise=self._noise_for_sample(sample), snr_db=random.choice([0, 5, 10]), ) pspec_noise = self.magnitude_power_spectrum(y_noise) for orig_row, noisy_row in zip(power_spec.T, pspec_noise.T): yield ( torch.from_numpy(noisy_row).float(), torch.from_numpy(orig_row).float(), ) else: for orig_row in power_spec.T: yield ( torch.from_numpy(orig_row).float(), torch.from_numpy(orig_row).float(), )
[docs] class LogMagnitudeSpectrumExtractor(BaseExtractor): """Log-magnitude power spectrum feature extractor (no mel projection). Converts a raw mono waveform into a sequence of single-sided log magnitude spectrum frames. Calling an instance yields one ``(feature, label)`` pair per STFT frame. When *noise* is provided, a noisy version of the waveform is synthesised by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of 0, 5, or 10 dB, and every yielded pair becomes ``(noisy_feature, clean_feature)``. The extractor is designed to be instantiated **once** and called repeatedly — one call per utterance — from inside a :class:`~torch.utils.data.DataLoader`. :param sampling_rate: Expected sample rate of input waveforms in Hz. :type sampling_rate: int :param window_length: STFT window length in samples (also used as the FFT size). Each feature vector has length ``1 + window_length // 2``. :type window_length: int :param hop_length: STFT hop size in samples. :type hop_length: int :param noise: Optional DEMAND noise dataset used for on-the-fly noise mixing. Pass ``None`` for clean-only feature extraction. :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None Example ------- .. code-block:: python extractor = LogMagnitudeSpectrumExtractor( sampling_rate=16_000, window_length=512, hop_length=256, noise=None, ) for feature, label in extractor(waveform): assert feature.shape == (257,) # 1 + 512 // 2 """ def __init__( self, sampling_rate: int, window_length: int, hop_length: int, noise: DEMANDNoiseDataset | None, ) -> None: self.fs = sampling_rate self.window_length = window_length self.hop_length = hop_length self.noise = noise #: Flat length of each feature vector: ``1 + window_length // 2`` #: (the number of unique frequency bins in the single-sided STFT). self.sample_shape = (1 + window_length // 2,)
[docs] def log_magnitude_power_spectrum( self, sample: NDArray[np.float32], ) -> NDArray[np.float32]: """Compute the single-sided magnitude power spectrum frame by frame. Applies the STFT with a Hann window and returns ``|STFT|²`` — the power of each frequency bin for every frame. No mel projection is applied. :param sample: Mono audio waveform. :type sample: :class:`numpy.ndarray` of float32 :return: Array of shape ``(1 + window_length // 2, n_frames)`` where each column is the power spectrum of one STFT frame. :rtype: :class:`numpy.ndarray` of float32 """ # STFT: complex array of shape (1 + window_length // 2, n_frames) stft = librosa.core.stft( y=sample, n_fft=self.window_length, win_length=self.window_length, hop_length=self.hop_length, window="hann", ) # Log magnitude power spectrogram: (1 + window_length // 2, n_frames) return np.log( np.abs(stft) + 1e-10 ) # Add small constant to avoid log(0)
[docs] def __call__( self, sample: NDArray[np.float32] ) -> Generator[tuple[Sample, Label], None, None]: """Yield ``(feature, label)`` pairs for every STFT frame. The waveform is converted to a magnitude power spectrogram and one pair is yielded per frame (column of the spectrogram). Each tensor contains the single-sided power spectrum of that frame. When :attr:`noise` is set, a synthetic noisy copy of the waveform is blended at a randomly selected SNR of 0, 5, or 10 dB before feature extraction, and the pair becomes ``(noisy_feature, clean_feature)``. :param sample: Mono audio waveform at :attr:`fs` Hz. :type sample: :class:`numpy.ndarray` of float32 :return: Generator of ``(feature, label)`` tensor pairs, each tensor having shape ``(1 + window_length // 2,)``. :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None] """ # Compute log magnitude power spectrogram for the clean signal. # Shape: (1 + window_length // 2, n_frames) power_spec = self.log_magnitude_power_spectrum(sample) if self.noise is not None: # Blend a randomly positioned noise segment at a randomly chosen SNR. y_noise = add_noise_snr( signal=sample, noise=self._noise_for_sample(sample), snr_db=random.choice([0, 5, 10]), ) pspec_noise = self.log_magnitude_power_spectrum(y_noise) for orig_row, noisy_row in zip(power_spec.T, pspec_noise.T): yield ( torch.from_numpy(noisy_row).float(), torch.from_numpy(orig_row).float(), ) else: for orig_row in power_spec.T: yield ( torch.from_numpy(orig_row).float(), torch.from_numpy(orig_row).float(), )
[docs] class RawWaveformExtractor(BaseExtractor): """Raw waveform extractor — no frequency transform applied. Splits a mono waveform into non-overlapping windows of *window_length* samples and yields each window directly as a feature vector. This is the natural companion extractor for time-domain models such as :class:`~libdse.nets.WaveUNet` that operate on raw audio rather than spectrograms. When *noise* is provided, a noisy mixture is generated with :func:`~libdse.data.noise.add_noise_snr` at a random SNR (0, 5, or 10 dB) and the pair becomes ``(noisy_window, clean_window)``; otherwise both elements of the pair are the same clean window. :param sampling_rate: Expected sample rate of input waveforms in Hz. :type sampling_rate: int :param window_length: Number of samples per output feature vector. :type window_length: int :param noise: Optional DEMAND noise dataset for on-the-fly noise mixing. :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None """ def __init__( self, sampling_rate: int, window_length: int, noise: DEMANDNoiseDataset | None, ) -> None: self.fs = sampling_rate self.window_length = window_length self.noise = noise #: Shape of each feature vector: ``(window_length,)``. self.sample_shape = (window_length,)
[docs] def __call__( self, sample: NDArray[np.float32] ) -> Generator[tuple[Sample, Label], None, None]: """Yield ``(feature, label)`` pairs for every non-overlapping window. The waveform is zero-padded at the end when its length is not an integer multiple of *window_length*, ensuring no samples are silently dropped. :param sample: Mono audio waveform at :attr:`fs` Hz. :type sample: :class:`numpy.ndarray` of float32 :return: Generator of ``(feature, label)`` tensor pairs, each of shape ``(window_length,)``. :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None] """ # Zero-pad so the waveform fills an integer number of windows. if len(sample) % self.window_length != 0: padding = self.window_length - (len(sample) % self.window_length) sample = np.pad(sample, (0, padding), mode="constant") if self.noise is not None: y_noise = add_noise_snr( signal=sample, noise=self._noise_for_sample(sample), snr_db=random.choice([0, 5, 10]), ) for i in range( 0, len(sample) - self.window_length + 1, self.window_length ): noisy_segment = y_noise[i : i + self.window_length] clean_segment = sample[i : i + self.window_length] yield ( torch.from_numpy(noisy_segment).float(), torch.from_numpy(clean_segment).float(), ) else: for i in range( 0, len(sample) - self.window_length + 1, self.window_length ): segment = sample[i : i + self.window_length] yield ( torch.from_numpy(segment).float(), torch.from_numpy(segment).float(), )