Source code for libdse.data.noise

"""Noise dataset utilities for the DEMAND corpus.

This module provides three public objects used to load and mix real-world
background noise into clean speech:

- :class:`DEMANDNoiseType` — An :class:`~enum.Enum` that maps human-readable
  environment names to the exact directory names used in the
  `DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ dataset archive.
- :class:`DEMANDNoiseDataset` — Loads one or more noise environments from
  disk, concatenates them into a single array, and exposes it for slicing.
- :func:`add_noise_snr` — Mixes a noise segment into a clean signal at a
  caller-specified signal-to-noise ratio.

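The DEMAND dataset contains 18 noise environments recorded on 16 channels;
this module uses the 16 kHz versions (``*_16k`` directories) of the 17
environments listed in :class:`DEMANDNoiseType`.  Only channel 1
(``ch01.wav``) is used here.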

Typical usage
-------------
.. code-block:: python

    from pathlib import Path
    from libdse.data.noise import DEMANDNoiseDataset, DEMANDNoiseType, add_noise_snr

    noise_ds = DEMANDNoiseDataset(
        entry_point=Path("data/noise/DEMAND"),
        noise_types=DEMANDNoiseType.ALL,
    )
    noisy = add_noise_snr(signal=clean_waveform, noise=noise_ds.noise[:len(clean_waveform)], snr_db=10)
"""

from enum import Enum
from pathlib import Path
import librosa
import numpy as np
from numpy.typing import NDArray

from libdse.data.err import EntryPointError


class DEMANDNoiseType(Enum):
    """Directory-name identifiers for the `DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ noise dataset.

    Each member's value is the exact directory name inside the DEMAND
    archive, which follows the pattern ``<CATEGORY><NAME>_<FS>k``.  Pass a
    single member, a list of members, or the convenience member :attr:`ALL`
    to :class:`DEMANDNoiseDataset` to control which environments are loaded.

    Members
    -------
    ================  ====================
    Member            Directory name
    ================  ====================
    ``KITCHEN``       ``DKITCHEN_16k``
    ``LIVING``        ``DLIVING_16k``
    ``WASHING``       ``DWASHING_16k``
    ``FIELD``         ``NFIELD_16k``
    ``PARK``          ``NPARK_16k``
    ``RIVER``         ``NRIVER_16k``
    ``HALLWAY``       ``OHALLWAY_16k``
    ``MEETING``       ``OMEETING_16k``
    ``OFFICE``        ``OOFFICE_16k``
    ``CAFETERIA``     ``PCAFETER_16k``
    ``RESTAURANT``    ``PRESTO_16k``
    ``STATION``       ``PSTATION_16k``
    ``SQUARE``        ``SPSQUARE_16k``
    ``TRAFFIC``       ``STRAFFIC_16k``
    ``BUS``           ``TBUS_16k``
    ``CAR``           ``TCAR_16k``
    ``METRO``         ``TMETRO_16k``
    ``ALL``           *(all of the above)*
    ================  ====================
    """

    KITCHEN = "DKITCHEN_16k"
    LIVING = "DLIVING_16k"
    WASHING = "DWASHING_16k"
    FIELD = "NFIELD_16k"
    PARK = "NPARK_16k"
    RIVER = "NRIVER_16k"
    HALLWAY = "OHALLWAY_16k"
    MEETING = "OMEETING_16k"
    OFFICE = "OOFFICE_16k"
    CAFETERIA = "PCAFETER_16k"
    RESTAURANT = "PRESTO_16k"
    STATION = "PSTATION_16k"
    SQUARE = "SPSQUARE_16k"
    TRAFFIC = "STRAFFIC_16k"
    BUS = "TBUS_16k"
    CAR = "TCAR_16k"
    METRO = "TMETRO_16k"

    #: Convenience value that selects every environment at once.
    #: Pass ``DEMANDNoiseType.ALL`` to :class:`DEMANDNoiseDataset` to load
    #: all 17 DEMAND environments in a single call.
    #:
    #: Inside the class body the names above are still the plain directory
    #: strings, so the list is built from them rather than re-typing each
    #: literal; a renamed directory therefore only has to be edited once.
    ALL = [
        KITCHEN,
        LIVING,
        WASHING,
        FIELD,
        PARK,
        RIVER,
        HALLWAY,
        MEETING,
        OFFICE,
        CAFETERIA,
        RESTAURANT,
        STATION,
        SQUARE,
        TRAFFIC,
        BUS,
        CAR,
        METRO,
    ]
class DEMANDNoiseDataset:
    """Loads and exposes DEMAND background-noise recordings as a single array.

    Each environment of the
    `DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ dataset lives in its
    own sub-directory (see :class:`DEMANDNoiseType` for the directory names)
    and holds one WAV file per channel.  Only channel 1 (``ch01.wav``) is
    used here.  All selected recordings are concatenated end-to-end into
    :attr:`noise` so that callers can slice arbitrary-length segments
    without managing individual files.

    :param entry_point: Directory that directly contains the per-environment
        sub-directories (e.g. ``DKITCHEN_16k/``, ``TCAR_16k/``, …).
    :type entry_point: :class:`pathlib.Path`
    :param noise_types: Noise environments to load.  Pass a single
        :class:`DEMANDNoiseType` member, a list of members, or the special
        value :attr:`DEMANDNoiseType.ALL` to load every environment at once.
        Every requested type must have a matching sub-directory under
        *entry_point*.
    :type noise_types: :class:`DEMANDNoiseType` or list[:class:`DEMANDNoiseType`]
    :param sample_rate: Sample rate requested from ``librosa.load`` for every
        file.  Defaults to 16 000 Hz.
    :type sample_rate: int
    :raises EntryPointError: If any requested environment directory is
        missing under *entry_point*, or if none of the requested directories
        contains a ``ch01.wav`` file.
    :raises ValueError: If *noise_types* is an empty list.

    .. attribute:: fs
       :type: int

       The sample rate passed as *sample_rate*.

    .. attribute:: target_files
       :type: list[pathlib.Path]

       The ``ch01.wav`` files that were loaded, ordered by environment
       directory name and then by path.

    .. attribute:: noise
       :type: numpy.ndarray

       1-D array containing all noise samples concatenated in the order of
       :attr:`target_files`.  Slice this directly to obtain segments of
       arbitrary length.
    """

    def __init__(
        self,
        entry_point: Path,
        noise_types: list[DEMANDNoiseType] | DEMANDNoiseType,
        sample_rate: int = 16_000,
    ) -> None:
        """Validate *entry_point* and load all requested noise recordings.

        :param entry_point: Root directory of the DEMAND dataset.
        :type entry_point: :class:`pathlib.Path`
        :param noise_types: Environments to include.  Accepts a single
            :class:`DEMANDNoiseType`, a list of members, or
            :attr:`DEMANDNoiseType.ALL` (alone or inside a list).
        :type noise_types: :class:`DEMANDNoiseType` or list[:class:`DEMANDNoiseType`]
        :param sample_rate: Target sample rate for the loaded audio.
        :type sample_rate: int
        :raises EntryPointError: If a required environment directory is
            absent, or no ``ch01.wav`` file exists under the requested
            directories.
        :raises ValueError: If *noise_types* is empty.

        .. note::
           A non-existent *entry_point* is not translated: the ``OSError``
           raised by :meth:`pathlib.Path.iterdir` propagates unchanged.
        """
        # Each DEMAND environment folder contains one WAV file per channel
        # (ch01.wav, ch02.wav, …).  Only channel 1 is loaded here.
        filename = "ch01.wav"

        # The caller may request a rate other than the files' native one;
        # the rate is handed to librosa.load, which resamples when needed.
        self.fs = sample_rate

        # Sorted + de-duplicated so that the concatenation order (and hence
        # the content of ``self.noise``) is reproducible across platforms
        # instead of following the filesystem's directory-listing order.
        required_dirs = sorted(set(self._requested_dir_names(noise_types)))
        if not required_dirs:
            raise ValueError(
                "noise_types must name at least one DEMAND environment."
            )

        # Only real directories count as available environments; stray files
        # in the entry point must not satisfy a request.
        available_dirs = sorted(
            child.name for child in entry_point.iterdir() if child.is_dir()
        )

        if not set(required_dirs).issubset(available_dirs):
            raise EntryPointError(
                "DEMAND entry point is missing required environment "
                f"directories.\n\n"
                f"Available: {available_dirs}\n"
                f"Requested: {required_dirs}"
            )

        self.target_files: list[Path] = []
        for dir_name in required_dirs:
            # rglob order is filesystem dependent as well, so sort the hits.
            self.target_files.extend(
                sorted((entry_point / dir_name).rglob(filename))
            )

        if not self.target_files:
            # Without this guard np.concatenate fails with an opaque
            # "need at least one array to concatenate" message.
            raise EntryPointError(
                f"No '{filename}' files were found under the requested "
                f"environment directories in {entry_point}.\n\n"
                f"Requested: {required_dirs}"
            )

        self.noise = self._expand_noise()

    @staticmethod
    def _requested_dir_names(
        noise_types: list[DEMANDNoiseType] | DEMANDNoiseType,
    ) -> list[str]:
        """Translate *noise_types* into a flat list of directory names.

        :attr:`DEMANDNoiseType.ALL` carries a list as its value, so it is
        expanded in place whether it is passed alone or as a list element.

        :param noise_types: A single member or a list of members.
        :return: Directory names in request order (may contain duplicates).
        :rtype: list[str]
        """
        if isinstance(noise_types, DEMANDNoiseType):
            noise_types = [noise_types]

        names: list[str] = []
        for noise_type in noise_types:
            if noise_type is DEMANDNoiseType.ALL:
                names.extend(noise_type.value)
            else:
                names.append(noise_type.value)
        return names

    def __repr__(self) -> str:
        """Return a concise string representation of the dataset.

        :return: ``DEMANDNoiseDataset(fs=F, noise_samples=N)``
        :rtype: str
        """
        return (
            f"DEMANDNoiseDataset(fs={self.fs}, noise_samples={len(self.noise)})"
        )

    def _expand_noise(self) -> NDArray[np.float32]:
        """Load all target WAV files and concatenate them into a single array.

        Called once during :meth:`__init__`.  The result is stored as
        :attr:`noise` and is not recomputed afterwards.

        :return: 1-D array of all noise samples concatenated in the order
            :attr:`target_files` was populated.
        :rtype: :class:`numpy.ndarray`
        :raises ValueError: If :attr:`target_files` is empty (raised by
            :func:`numpy.concatenate`).
        """
        # librosa.load returns (samples, sample_rate); only the samples are
        # kept because the rate is already known to be self.fs.
        samples = [
            librosa.load(wav_path, sr=self.fs, mono=True)[0]
            for wav_path in self.target_files
        ]
        return np.concatenate(samples)
def add_noise_snr(
    signal: NDArray[np.float32], noise: NDArray[np.float32], snr_db: float
) -> NDArray[np.float32]:
    """Mix *noise* into *signal* at a target signal-to-noise ratio.

    The noise array is first tiled (repeated from its start) or truncated to
    match the length of *signal*, then scaled so that the resulting SNR
    equals *snr_db*.  If the mixture clips (peak > 1.0) it is
    peak-normalised.

    .. math::

        \\text{SNR}_{\\text{dB}} = 10 \\log_{10}\\!
        \\left(\\frac{P_{\\text{signal}}}{P_{\\text{noise}}}\\right)

    Edge cases:

    * An empty *signal* yields an empty array.
    * If the length-matched noise is completely silent (zero power) no SNR
      can be reached by scaling, so the signal is returned without added
      noise instead of an array of NaNs.

    :param signal: Clean mono waveform, assumed to be in ``[-1, 1]``.
    :type signal: :class:`numpy.ndarray` of float32
    :param noise: Noise waveform.  May be shorter or longer than *signal*.
    :type noise: :class:`numpy.ndarray` of float32
    :param snr_db: Desired signal-to-noise ratio in decibels.
    :type snr_db: float
    :return: Noisy mixture with the same length as *signal*, peak-normalised
        if clipping occurs.  The result is always a new array whose dtype is
        the promotion of both input dtypes with float32 (float32 for float32
        inputs).
    :rtype: :class:`numpy.ndarray` of float32
    :raises ValueError: If *noise* is empty while *signal* is not.
    """
    signal = np.asarray(signal)
    noise = np.asarray(noise)

    # Fix the output dtype up front so that the type of ``snr_db`` (Python
    # float vs. NumPy float64 scalar) cannot silently widen the result.
    out_dtype = np.result_type(signal.dtype, noise.dtype, np.float32)

    n_samples = len(signal)
    if n_samples == 0:
        return signal.astype(out_dtype)
    if len(noise) == 0:
        raise ValueError("noise must contain at least one sample.")

    # Intermediate arithmetic runs in float64; the cast back to ``out_dtype``
    # happens once, at the end.
    clean = signal.astype(np.float64)
    # np.resize repeats the noise from its start when it is too short and
    # keeps only the leading samples when it is too long.
    interference = np.resize(noise, n_samples).astype(np.float64)

    # Power = mean squared amplitude.
    p_signal = np.mean(clean**2)
    p_noise = np.mean(interference**2)

    if p_noise > 0.0:
        # Derive the noise power that satisfies the target SNR, then scale.
        p_target_noise = p_signal / (10.0 ** (snr_db / 10.0))
        gain = np.sqrt(p_target_noise / p_noise)
        mixture = clean + gain * interference
    else:
        # Silent noise: scaling would divide by zero and produce NaNs.
        mixture = clean

    # Peak-normalise to prevent clipping.
    peak = np.max(np.abs(mixture))
    if peak > 1.0:
        mixture = mixture / peak

    return mixture.astype(out_dtype)