Source code for libdse.data.features
"""Feature extraction utilities for log-mel power spectrograms.
This module provides the abstract :class:`BaseExtractor` interface and the
concrete :class:`LogMelPowerSpectrumExtractor` implementation used to build
training samples for the denoising autoencoder (DAE).
The feature pipeline converts raw mono waveforms into fixed-width log-mel
power spectrogram vectors following the approach described in:
Lu, X., Tsao, Y., Matsuda, S., & Hori, C. (2013). *Speech enhancement
based on deep denoising autoencoder*. INTERSPEECH 2013.
Pipeline summary
----------------
1. Compute the short-time Fourier transform (STFT) with a Hann window.
2. Project the magnitude-squared spectrum onto a mel filterbank.
3. Divide the resulting mel spectrogram into non-overlapping temporal windows
of *chunks_per_feature* frames; discard incomplete trailing windows.
4. Flatten each window into a 1-D vector of length
``n_mels * chunks_per_feature``.
When a :class:`~libdse.data.noise.DEMANDNoiseDataset` is supplied to
:class:`LogMelPowerSpectrumExtractor`, a noisy copy of the waveform is
synthesised on the fly by :func:`~libdse.data.noise.add_noise_snr`. The
returned training pair is then ``(noisy_feature, clean_feature)`` instead of
``(clean_feature, clean_feature)``.
Classes
-------
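- :class:`BaseExtractor` — Abstract base; subclass to define custom extractors.
- :class:`LogMelPowerSpectrumExtractor` — Log-mel power spectrum extractor.
- :class:`PowerSpectrumExtractor` — Raw magnitude power spectrum extractor.
- :class:`LogMagnitudeSpectrumExtractor` — Log-magnitude spectrum extractor.
- :class:`RawWaveformExtractor` — Time-domain extractor yielding raw waveform windows.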
Typical usage
-------------
.. code-block:: python
from pathlib import Path
from libdse.data.features import LogMelPowerSpectrumExtractor
from libdse.data.noise import DEMANDNoiseDataset, DEMANDNoiseType
noise_ds = DEMANDNoiseDataset(
entry_point=Path("data/noise/DEMAND"),
noise_types=DEMANDNoiseType.ALL,
)
extractor = LogMelPowerSpectrumExtractor(
sampling_rate=16_000,
window_length=512,
hop_length=128,
n_mels=40,
chunks_per_feature=7,
noise=noise_ds,
)
# Called once per utterance inside a DataLoader worker — yields one pair
# per non-overlapping spectrogram window:
for noisy_feat, clean_feat in extractor(waveform):
...
"""
from abc import ABC, abstractmethod
import random
import numpy as np
import librosa
from typing import Generator
from numpy.typing import NDArray
from torch import Tensor
import torch
from libdse.data.noise import DEMANDNoiseDataset, add_noise_snr
#: Type alias for a feature (model input) tensor returned by an extractor.
#: Plain alias of :class:`torch.Tensor`; it exists only to make signatures
#: self-describing.
Sample = Tensor
#: Type alias for a label (training target) tensor returned by an extractor.
#: Plain alias of :class:`torch.Tensor`, like :data:`Sample`.
Label = Tensor
[docs]
class BaseExtractor(ABC):
    """Abstract base class for feature extractors.

    Defines the interface expected by
    :class:`~libdse.data.librispeech.LibriSpeechDataset`. Concrete subclasses
    must implement :meth:`__call__`, which converts a raw mono waveform into a
    stream of ``(sample, label)`` tensor pairs.

    .. attribute:: sample_shape
        :type: tuple[int, ...]

        Shape of a single feature vector produced by this extractor. Must be
        set in the subclass ``__init__`` before the instance is passed to
        :class:`~libdse.data.librispeech.LibriSpeechDataset`.

    .. attribute:: noise
        :type: DEMANDNoiseDataset or None

        Noise source read by :meth:`_noise_for_sample`. Subclasses in this
        module accept ``None`` here to disable noise mixing.
    """

    @abstractmethod
    def __init__(self):
        # Annotation-only declarations: they assign nothing at runtime and
        # merely document the attributes every subclass is expected to set.
        self.sample_shape: tuple
        self.noise: DEMANDNoiseDataset | None
        pass

    @abstractmethod
    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield ``(feature, label)`` pairs from a raw audio waveform.

        The waveform is split into non-overlapping windows; one pair is
        yielded for each window. The number of pairs depends on the
        duration of *sample* and on :attr:`sample_shape`.

        :param sample: Mono audio waveform at the extractor's expected
            sampling rate.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each tensor
            having shape :attr:`sample_shape`.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """

    def _noise_for_sample(
        self, sample: NDArray[np.float32]
    ) -> NDArray[np.float32]:
        """Return a randomly positioned noise segment of the same length as *sample*.

        The start offset is drawn uniformly at random (via
        :func:`random.randint`) from all positions for which the slice fits
        entirely inside the noise signal stored in ``self.noise.noise``.

        :param sample: Clean audio waveform. Only its length is used.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Noise segment with ``len(sample)`` samples.
        :rtype: :class:`numpy.ndarray` of float32
        :raises ValueError: If the available noise signal is shorter than
            *sample*, so no valid start offset exists.
        """
        # The noise dataset exposes its concatenated signal as ``.noise``.
        noise_array = self.noise.noise
        n_samples = len(sample)
        max_noise_start = len(noise_array) - n_samples

        # Without this guard ``random.randint`` fails with an opaque
        # "empty range" message that hides the actual cause.
        if max_noise_start < 0:
            raise ValueError(
                f"noise signal ({len(noise_array)} samples) is shorter than "
                f"the waveform ({n_samples} samples)"
            )

        # ``randint`` is inclusive on both ends, so the last valid offset
        # (segment flush with the end of the noise) can be selected too.
        noise_start = random.randint(0, max_noise_start)
        return noise_array[noise_start : noise_start + n_samples]
[docs]
class LogMelPowerSpectrumExtractor(BaseExtractor):
    """Log-mel power spectrum feature extractor.

    Converts a raw mono waveform into a sequence of log-mel power spectrogram
    feature vectors. The STFT is computed with a Hann window (the frame
    overlap is determined by *hop_length* relative to *window_length*); the
    power spectrum is projected through a mel filterbank and
    log-compressed; and the spectrogram is divided into non-overlapping
    windows of *chunks_per_feature* frames. Calling an instance yields one
    ``(feature, label)`` pair per window.

    When *noise* is provided, a noisy version of the waveform is synthesised
    by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of
    0, 5, or 10 dB, and every yielded pair becomes
    ``(noisy_feature, clean_feature)``.

    The extractor is designed to be instantiated **once** and called
    repeatedly — one call per utterance — from inside a
    :class:`~torch.utils.data.DataLoader`.

    :param sampling_rate: Expected sample rate of input waveforms in Hz.
    :type sampling_rate: int
    :param window_length: STFT window length in samples (also used as the
        FFT size).
    :type window_length: int
    :param hop_length: STFT hop size in samples.
    :type hop_length: int
    :param n_mels: Number of mel filterbank bins.
    :type n_mels: int
    :param chunks_per_feature: Number of consecutive spectrogram frames per
        output feature vector.
    :type chunks_per_feature: int
    :param noise: Optional DEMAND noise dataset used for on-the-fly noise
        mixing. Pass ``None`` for clean-only feature extraction.
    :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None

    Example
    -------
    .. code-block:: python

        extractor = LogMelPowerSpectrumExtractor(
            sampling_rate=16_000,
            window_length=512,
            hop_length=128,
            n_mels=40,
            chunks_per_feature=7,
            noise=None,
        )
        for feature, label in extractor(waveform):
            assert feature.shape == (40 * 7,)
    """

    def __init__(
        self,
        sampling_rate: int,
        window_length: int,
        hop_length: int,
        n_mels: int,
        chunks_per_feature: int,
        noise: DEMANDNoiseDataset | None,
    ) -> None:
        self.fs = sampling_rate
        self.window_length = window_length
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.chunks_per_feature = chunks_per_feature
        self.noise = noise

        # Build the mel filterbank once at construction time so it is not
        # recomputed on every call. Shape: (n_mels, 1 + window_length // 2)
        self.mel_bank = librosa.filters.mel(
            sr=self.fs, n_fft=self.window_length, n_mels=self.n_mels
        )

        #: Flat length of each feature vector: ``n_mels * chunks_per_feature``.
        self.sample_shape = (n_mels * chunks_per_feature,)

    def mel_power_spectrum(
        self,
        sample: NDArray[np.float32],
    ) -> NDArray[np.float32]:
        """Compute a log-mel power spectrogram and split it into fixed-length chunks.

        Follows the feature extraction procedure described in:
        Lu, X. et al. (2012). *Speech Restoration Based on Deep Learning
        Autoencoder with Layer-Wised Pretraining*.

        The spectrogram is divided into non-overlapping temporal windows of
        :attr:`chunks_per_feature` frames. Incomplete trailing windows are
        discarded without padding. A waveform too short to fill even one
        window produces an empty result with zero rows instead of raising.

        :param sample: Mono audio waveform.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Array of shape ``(n_chunks, n_mels * chunks_per_feature)``
            where each row is a flattened temporal window.
        :rtype: :class:`numpy.ndarray` of float32
        """
        # STFT: complex array of shape (1 + window_length // 2, n_frames)
        stft = librosa.core.stft(
            y=sample,
            n_fft=self.window_length,
            win_length=self.window_length,
            hop_length=self.hop_length,
            window="hann",
        )

        # Log power mel spectrogram: (n_mels, n_frames).
        # mel_bank @ |STFT|^2 projects the power spectrum onto the mel scale;
        # the 1e-8 floor keeps the logarithm finite for silent bins.
        mel_spec = np.log(self.mel_bank @ np.abs(stft) ** 2 + 1e-8)

        n_bands, n_frames = mel_spec.shape
        width = self.chunks_per_feature
        n_chunks = n_frames // width

        # Drop the incomplete trailing window, then view the time axis as
        # (n_chunks, width). Moving the chunk axis to the front and
        # flattening reproduces "slice (n_mels, width) and ravel" for every
        # window. The target shape is spelled out explicitly (no -1) so the
        # reshape also works when n_chunks == 0.
        usable = mel_spec[:, : n_chunks * width]
        windows = usable.reshape(n_bands, n_chunks, width).transpose(1, 0, 2)
        return windows.reshape(n_chunks, n_bands * width)

    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield ``(feature, label)`` pairs for every non-overlapping window.

        The waveform is converted to a log-mel power spectrogram, divided
        into non-overlapping windows of :attr:`chunks_per_feature` frames,
        and one pair is yielded per window. Incomplete trailing windows are
        discarded; a waveform shorter than one window yields nothing.

        When :attr:`noise` is set, a synthetic noisy copy of the waveform is
        blended at a randomly selected SNR of 0, 5, or 10 dB before feature
        extraction, and the pair becomes ``(noisy_feature, clean_feature)``.
        Otherwise both elements are built from the same clean window.

        :param sample: Mono audio waveform at :attr:`fs` Hz.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each tensor
            having shape ``(n_mels * chunks_per_feature,)``.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """
        # Shape: (n_chunks, n_mels * chunks_per_feature)
        clean_chunks = self.mel_power_spectrum(sample)

        if self.noise is None:
            # Clean-only mode: the input feature is the target itself.
            noisy_chunks = clean_chunks
        else:
            # Blend a randomly positioned noise segment at a randomly chosen
            # SNR. The segment is drawn before the SNR so the sequence of
            # random draws is unchanged.
            noisy_waveform = add_noise_snr(
                signal=sample,
                noise=self._noise_for_sample(sample),
                snr_db=random.choice([0, 5, 10]),
            )
            noisy_chunks = self.mel_power_spectrum(noisy_waveform)

        for noisy_row, clean_row in zip(noisy_chunks, clean_chunks):
            yield (
                torch.from_numpy(noisy_row).float(),
                torch.from_numpy(clean_row).float(),
            )
[docs]
class PowerSpectrumExtractor(BaseExtractor):
    """Power spectrum feature extractor without mel projection or log.

    Turns a raw mono waveform into a sequence of single-sided power
    spectrum frames. No mel filterbank is applied — the full
    ``(1 + window_length // 2)``-bin power spectrum of each STFT frame is
    used directly as a feature vector. Calling an instance yields one
    ``(feature, label)`` pair per STFT frame.

    When *noise* is provided, a noisy version of the waveform is synthesised
    by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of
    0, 5, or 10 dB, and every yielded pair becomes
    ``(noisy_feature, clean_feature)``.

    The extractor is meant to be built **once** and then called once per
    utterance, e.g. from inside a :class:`~torch.utils.data.DataLoader`.

    :param sampling_rate: Expected sample rate of input waveforms in Hz.
    :type sampling_rate: int
    :param window_length: STFT window length in samples (also used as the
        FFT size). Each feature vector has length ``1 + window_length // 2``.
    :type window_length: int
    :param hop_length: STFT hop size in samples.
    :type hop_length: int
    :param noise: Optional DEMAND noise dataset used for on-the-fly noise
        mixing. Pass ``None`` for clean-only feature extraction.
    :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None

    Example
    -------
    .. code-block:: python

        extractor = PowerSpectrumExtractor(
            sampling_rate=16_000,
            window_length=512,
            hop_length=256,
            noise=None,
        )
        for feature, label in extractor(waveform):
            assert feature.shape == (257,)  # 1 + 512 // 2
    """

    def __init__(
        self,
        sampling_rate: int,
        window_length: int,
        hop_length: int,
        noise: DEMANDNoiseDataset | None,
    ) -> None:
        self.noise = noise
        self.fs = sampling_rate
        self.hop_length = hop_length
        self.window_length = window_length

        #: One value per bin of the single-sided STFT:
        #: ``1 + window_length // 2``.
        self.sample_shape = (1 + window_length // 2,)

    def magnitude_power_spectrum(
        self,
        sample: NDArray[np.float32],
    ) -> NDArray[np.float32]:
        """Return the per-frame single-sided power spectrum ``|STFT|²``.

        The STFT uses a Hann window of :attr:`window_length` samples and a
        hop of :attr:`hop_length` samples. No mel projection and no
        logarithm are applied.

        :param sample: Mono audio waveform.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Array of shape ``(1 + window_length // 2, n_frames)`` where
            each column is the power spectrum of one STFT frame.
        :rtype: :class:`numpy.ndarray` of float32
        """
        # Complex spectrogram, one column per frame.
        spectrum = librosa.core.stft(
            y=sample,
            n_fft=self.window_length,
            win_length=self.window_length,
            hop_length=self.hop_length,
            window="hann",
        )
        magnitude = np.abs(spectrum)
        return magnitude ** 2

    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield one ``(feature, label)`` pair per STFT frame.

        Each tensor holds the single-sided power spectrum of one frame.
        With :attr:`noise` set, the feature comes from a noisy copy of the
        waveform mixed at a random SNR of 0, 5, or 10 dB and the label from
        the clean waveform; without it both come from the clean frame.

        :param sample: Mono audio waveform at :attr:`fs` Hz.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each tensor
            having shape ``(1 + window_length // 2,)``.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """
        # Shape: (1 + window_length // 2, n_frames)
        clean_spec = self.magnitude_power_spectrum(sample)

        if self.noise is None:
            noisy_spec = clean_spec
        else:
            # Draw the noise segment first, then the SNR, so the order of
            # random draws stays fixed.
            noise_segment = self._noise_for_sample(sample)
            snr_db = random.choice([0, 5, 10])
            noisy_waveform = add_noise_snr(
                signal=sample,
                noise=noise_segment,
                snr_db=snr_db,
            )
            noisy_spec = self.magnitude_power_spectrum(noisy_waveform)

        # Transposing makes iteration run over frames instead of bins.
        for clean_frame, noisy_frame in zip(clean_spec.T, noisy_spec.T):
            feature = torch.from_numpy(noisy_frame).float()
            label = torch.from_numpy(clean_frame).float()
            yield (feature, label)
[docs]
class LogMagnitudeSpectrumExtractor(BaseExtractor):
    """Log-magnitude spectrum feature extractor (no mel projection).

    Turns a raw mono waveform into a sequence of single-sided log-magnitude
    spectrum frames. Calling an instance yields one ``(feature, label)``
    pair per STFT frame.

    When *noise* is provided, a noisy version of the waveform is synthesised
    by :func:`~libdse.data.noise.add_noise_snr` at a randomly selected SNR of
    0, 5, or 10 dB, and every yielded pair becomes
    ``(noisy_feature, clean_feature)``.

    The extractor is meant to be built **once** and then called once per
    utterance, e.g. from inside a :class:`~torch.utils.data.DataLoader`.

    :param sampling_rate: Expected sample rate of input waveforms in Hz.
    :type sampling_rate: int
    :param window_length: STFT window length in samples (also used as the
        FFT size). Each feature vector has length ``1 + window_length // 2``.
    :type window_length: int
    :param hop_length: STFT hop size in samples.
    :type hop_length: int
    :param noise: Optional DEMAND noise dataset used for on-the-fly noise
        mixing. Pass ``None`` for clean-only feature extraction.
    :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None

    Example
    -------
    .. code-block:: python

        extractor = LogMagnitudeSpectrumExtractor(
            sampling_rate=16_000,
            window_length=512,
            hop_length=256,
            noise=None,
        )
        for feature, label in extractor(waveform):
            assert feature.shape == (257,)  # 1 + 512 // 2
    """

    def __init__(
        self,
        sampling_rate: int,
        window_length: int,
        hop_length: int,
        noise: DEMANDNoiseDataset | None,
    ) -> None:
        self.noise = noise
        self.fs = sampling_rate
        self.hop_length = hop_length
        self.window_length = window_length

        #: One value per bin of the single-sided STFT:
        #: ``1 + window_length // 2``.
        self.sample_shape = (1 + window_length // 2,)

    def log_magnitude_power_spectrum(
        self,
        sample: NDArray[np.float32],
    ) -> NDArray[np.float32]:
        """Return the per-frame single-sided log-magnitude spectrum.

        The STFT uses a Hann window of :attr:`window_length` samples and a
        hop of :attr:`hop_length` samples. The result is
        ``log(|STFT| + 1e-10)`` — the natural logarithm of the magnitude
        (not of the squared magnitude). No mel projection is applied.

        :param sample: Mono audio waveform.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Array of shape ``(1 + window_length // 2, n_frames)`` where
            each column is the log-magnitude spectrum of one STFT frame.
        :rtype: :class:`numpy.ndarray` of float32
        """
        # Complex spectrogram, one column per frame.
        spectrum = librosa.core.stft(
            y=sample,
            n_fft=self.window_length,
            win_length=self.window_length,
            hop_length=self.hop_length,
            window="hann",
        )
        # The small offset keeps log() finite for bins with zero magnitude.
        floored_magnitude = np.abs(spectrum) + 1e-10
        return np.log(floored_magnitude)

    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield one ``(feature, label)`` pair per STFT frame.

        Each tensor holds the single-sided log-magnitude spectrum of one
        frame. With :attr:`noise` set, the feature comes from a noisy copy
        of the waveform mixed at a random SNR of 0, 5, or 10 dB and the
        label from the clean waveform; without it both come from the clean
        frame.

        :param sample: Mono audio waveform at :attr:`fs` Hz.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each tensor
            having shape ``(1 + window_length // 2,)``.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """
        # Shape: (1 + window_length // 2, n_frames)
        clean_spec = self.log_magnitude_power_spectrum(sample)

        if self.noise is None:
            noisy_spec = clean_spec
        else:
            # Draw the noise segment first, then the SNR, so the order of
            # random draws stays fixed.
            noise_segment = self._noise_for_sample(sample)
            snr_db = random.choice([0, 5, 10])
            noisy_waveform = add_noise_snr(
                signal=sample,
                noise=noise_segment,
                snr_db=snr_db,
            )
            noisy_spec = self.log_magnitude_power_spectrum(noisy_waveform)

        # Transposing makes iteration run over frames instead of bins.
        for clean_frame, noisy_frame in zip(clean_spec.T, noisy_spec.T):
            feature = torch.from_numpy(noisy_frame).float()
            label = torch.from_numpy(clean_frame).float()
            yield (feature, label)
[docs]
class RawWaveformExtractor(BaseExtractor):
    """Raw waveform extractor — no frequency transform applied.

    Cuts a mono waveform into non-overlapping windows of *window_length*
    samples and yields each window directly as a feature vector. This is the
    natural companion extractor for time-domain models such as
    :class:`~libdse.nets.WaveUNet` that operate on raw audio rather than
    spectrograms.

    When *noise* is provided, a noisy mixture is generated with
    :func:`~libdse.data.noise.add_noise_snr` at a random SNR (0, 5, or 10 dB)
    and the pair becomes ``(noisy_window, clean_window)``; otherwise both
    elements of the pair are built from the same clean window.

    :param sampling_rate: Expected sample rate of input waveforms in Hz.
    :type sampling_rate: int
    :param window_length: Number of samples per output feature vector.
    :type window_length: int
    :param noise: Optional DEMAND noise dataset for on-the-fly noise mixing.
    :type noise: :class:`~libdse.data.noise.DEMANDNoiseDataset` or None
    """

    def __init__(
        self,
        sampling_rate: int,
        window_length: int,
        noise: DEMANDNoiseDataset | None,
    ) -> None:
        self.noise = noise
        self.fs = sampling_rate
        self.window_length = window_length

        #: Shape of each feature vector: ``(window_length,)``.
        self.sample_shape = (window_length,)

    def __call__(
        self, sample: NDArray[np.float32]
    ) -> Generator[tuple[Sample, Label], None, None]:
        """Yield one ``(feature, label)`` pair per non-overlapping window.

        If the waveform length is not an integer multiple of
        *window_length*, zeros are appended so that the final partial window
        is kept rather than dropped. Noise, when enabled, is mixed into the
        already padded waveform.

        :param sample: Mono audio waveform at :attr:`fs` Hz.
        :type sample: :class:`numpy.ndarray` of float32
        :return: Generator of ``(feature, label)`` tensor pairs, each of shape
            ``(window_length,)``.
        :rtype: Generator[tuple[:class:`torch.Tensor`, :class:`torch.Tensor`], None, None]
        """
        width = self.window_length

        # Append zeros up to the next multiple of the window size.
        remainder = len(sample) % width
        if remainder != 0:
            sample = np.pad(sample, (0, width - remainder), mode="constant")

        if self.noise is None:
            # Clean-only mode: feature and label come from the same samples.
            mixture = sample
        else:
            # Draw the noise segment first, then the SNR, so the order of
            # random draws stays fixed.
            noise_segment = self._noise_for_sample(sample)
            snr_db = random.choice([0, 5, 10])
            mixture = add_noise_snr(
                signal=sample,
                noise=noise_segment,
                snr_db=snr_db,
            )

        for start in range(0, len(sample) - width + 1, width):
            stop = start + width
            feature = torch.from_numpy(mixture[start:stop]).float()
            label = torch.from_numpy(sample[start:stop]).float()
            yield (feature, label)