"""Noise dataset utilities for the DEMAND corpus.
This module provides two public objects used to load and mix real-world
background noise into clean speech:
- :class:`DEMANDNoiseType` — An :class:`~enum.Enum` that maps human-readable
environment names to the exact directory names used in the
`DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ dataset archive.
- :class:`DEMANDNoiseDataset` — Loads one or more noise environments from
disk, concatenates them into a single array, and exposes it for slicing.
- :func:`add_noise_snr` — Mixes a noise segment into a clean signal at a
caller-specified signal-to-noise ratio.
The DEMAND dataset contains 18 noise environments recorded at 16 kHz on
16 channels. Only channel 1 (``ch01.wav``) is used here.
Typical usage
-------------
.. code-block:: python
from pathlib import Path
from dae.data.noise import DEMANDNoiseDataset, DEMANDNoiseType, add_noise_snr
noise_ds = DEMANDNoiseDataset(
entry_point=Path("data/noise/DEMAND"),
noise_types=DEMANDNoiseType.ALL,
)
noisy = add_noise_snr(signal=clean_waveform, noise=noise_ds.noise[:len(clean_waveform)], snr_db=10)
"""
from enum import Enum
from pathlib import Path
import librosa
import numpy as np
from numpy.typing import NDArray
from libdse.data.err import EntryPointError
[docs]
class DEMANDNoiseType(Enum):
"""Directory-name identifiers for the `DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ noise dataset.
Each member's value is the exact directory name inside the DEMAND archive,
which follows the pattern ``<CATEGORY><NAME>_<FS>k``.
Pass a subset of members (or the convenience member :attr:`ALL`) to
:class:`DEMANDNoiseDataset` to control which environments are loaded.
Members
-------
================ ====================
Member Directory name
================ ====================
``KITCHEN`` ``DKITCHEN_16k``
``LIVING`` ``DLIVING_16k``
``WASHING`` ``DWASHING_16k``
``FIELD`` ``NFIELD_16k``
``PARK`` ``NPARK_16k``
``RIVER`` ``NRIVER_16k``
``HALLWAY`` ``OHALLWAY_16k``
``MEETING`` ``OMEETING_16k``
``OFFICE`` ``OOFFICE_16k``
``CAFETERIA`` ``PCAFETER_16k``
``RESTAURANT`` ``PRESTO_16k``
``STATION`` ``PSTATION_16k``
``SQUARE`` ``SPSQUARE_16k``
``TRAFFIC`` ``STRAFFIC_16k``
``BUS`` ``TBUS_16k``
``CAR`` ``TCAR_16k``
``METRO`` ``TMETRO_16k``
``ALL`` *(all of the above)*
================ ====================
"""
KITCHEN = "DKITCHEN_16k"
LIVING = "DLIVING_16k"
WASHING = "DWASHING_16k"
FIELD = "NFIELD_16k"
PARK = "NPARK_16k"
RIVER = "NRIVER_16k"
HALLWAY = "OHALLWAY_16k"
MEETING = "OMEETING_16k"
OFFICE = "OOFFICE_16k"
CAFETERIA = "PCAFETER_16k"
RESTAURANT = "PRESTO_16k"
STATION = "PSTATION_16k"
SQUARE = "SPSQUARE_16k"
TRAFFIC = "STRAFFIC_16k"
BUS = "TBUS_16k"
CAR = "TCAR_16k"
METRO = "TMETRO_16k"
#: Convenience value that selects every environment at once.
#: Pass ``DEMANDNoiseType.ALL`` to :class:`DEMANDNoiseDataset` to load
#: all 17 DEMAND environments in a single call.
ALL = [
"DKITCHEN_16k",
"DLIVING_16k",
"DWASHING_16k",
"NFIELD_16k",
"NPARK_16k",
"NRIVER_16k",
"OHALLWAY_16k",
"OMEETING_16k",
"OOFFICE_16k",
"PCAFETER_16k",
"PRESTO_16k",
"PSTATION_16k",
"SPSQUARE_16k",
"STRAFFIC_16k",
"TBUS_16k",
"TCAR_16k",
"TMETRO_16k",
]
[docs]
class DEMANDNoiseDataset:
"""Loads and exposes DEMAND background-noise recordings as a single array.
The `DEMAND <https://doi.org/10.5281/zenodo.1227120>`_ dataset contains
18 real-world noise environments, each recorded on 16 channels at 16 kHz.
Only channel 1 (``ch01.wav``) is used here. All selected recordings are
concatenated end-to-end into :attr:`noise` so that callers can slice
arbitrary-length segments without managing individual files.
:param entry_point: Directory that directly contains the per-environment
sub-directories (e.g. ``DKITCHEN_16k/``, ``TCAR_16k/``, …).
:type entry_point: :class:`pathlib.Path`
:param noise_types: Noise environments to load. Pass a single
:class:`DEMANDNoiseType` member, a list of members, or the special
value :attr:`DEMANDNoiseType.ALL` to load every environment at once.
Every requested type must have a matching sub-directory under
*entry_point*.
:type noise_types: :class:`DEMANDNoiseType` or list[:class:`DEMANDNoiseType`]
:raises EntryPointError: If any requested environment directory is missing
under *entry_point*.
.. attribute:: noise
:type: numpy.ndarray
1-D float32 array containing all noise samples concatenated in the
order the environment directories were iterated. Slice this directly
to obtain segments of arbitrary length.
"""
def __init__(
self,
entry_point: Path,
noise_types: list[DEMANDNoiseType] | DEMANDNoiseType,
sample_rate: int = 16_000,
) -> None:
"""Validate *entry_point* and load all requested noise recordings.
:param entry_point: Root directory of the DEMAND dataset.
:type entry_point: :class:`pathlib.Path`
:param noise_types: Environments to include. Accepts a single
:class:`DEMANDNoiseType`, a list of members, or
:attr:`DEMANDNoiseType.ALL`.
:type noise_types: :class:`DEMANDNoiseType` or list[:class:`DEMANDNoiseType`]
:param sample_rate: Target sample rate for the loaded audio.
:type sample_rate: int
:raises EntryPointError: If a required environment directory is absent.
"""
# Each DEMAND environment folder contains 16 mono WAV files named
# ch01.wav … ch16.wav. Only channel 1 is loaded here.
filename = "ch01.wav"
# Original DEMAND sample rate is 16 kHz, but we allow the caller to specify a different rate if they want. librosa will resample on load if needed.
self.fs = sample_rate
all_dirs = [d.name for d in entry_point.iterdir()]
if noise_types == DEMANDNoiseType.ALL:
required_noise_type_dirs = DEMANDNoiseType.ALL.value
elif isinstance(noise_types, DEMANDNoiseType):
required_noise_type_dirs = [noise_types.value]
else:
required_noise_type_dirs = [t.value for t in noise_types]
if not all(d in all_dirs for d in required_noise_type_dirs):
raise EntryPointError(
"DEMAND entry point is missing required environment "
f"directories.\n\n"
f"Available: {sorted(all_dirs)}\n"
f"Requested: {sorted(required_noise_type_dirs)}"
)
data_dirs = [
directory
for directory in entry_point.iterdir()
if directory.name in required_noise_type_dirs
]
self.target_files = []
for directory in data_dirs:
self.target_files.extend(directory.rglob(filename))
self.noise = self._expand_noise()
[docs]
def __repr__(self) -> str:
"""Return a concise string representation of the dataset.
:return: ``DEMANDNoiseDataset(fs=F, noise_samples=N)``
:rtype: str
"""
return (
f"DEMANDNoiseDataset(fs={self.fs}, noise_samples={len(self.noise)})"
)
def _expand_noise(self) -> NDArray[np.float32]:
"""Load all target WAV files and concatenate them into a single array.
Called once during :meth:`__init__`. The result is stored as
:attr:`noise` and is not recomputed afterwards.
:return: 1-D float32 array of all noise samples concatenated in the
order :attr:`target_files` was populated.
:rtype: :class:`numpy.ndarray` of float32
"""
samples = []
for f in self.target_files:
y, _ = librosa.load(f, sr=self.fs, mono=True)
samples.append(y)
return np.concatenate(samples)
[docs]
def add_noise_snr(
signal: NDArray[np.float32], noise: NDArray[np.float32], snr_db: float
) -> NDArray[np.float32]:
"""Mix *noise* into *signal* at a target signal-to-noise ratio.
The noise array is first padded (wrap mode) or truncated to match the
length of *signal*, then scaled so that the resulting SNR equals
*snr_db*. If the mixture clips (peak > 1.0) it is peak-normalised.
.. math::
\\text{SNR}_{\\text{dB}} = 10 \\log_{10}\\!
\\left(\\frac{P_{\\text{signal}}}{P_{\\text{noise}}}\\right)
:param signal: Clean mono waveform, assumed to be in ``[-1, 1]``.
:type signal: :class:`numpy.ndarray` of float32
:param noise: Noise waveform. May be shorter or longer than *signal*.
:type noise: :class:`numpy.ndarray` of float32
:param snr_db: Desired signal-to-noise ratio in decibels.
:type snr_db: float
:return: Noisy mixture with the same length as *signal*, peak-normalised
if clipping occurs.
:rtype: :class:`numpy.ndarray` of float32
"""
# Match noise length to signal (wrap-pad if shorter, truncate if longer).
if len(noise) < len(signal):
noise = np.pad(noise, (0, len(signal) - len(noise)), "wrap")
else:
noise = noise[: len(signal)]
# Power = mean squared amplitude.
p_signal = np.mean(signal**2)
p_noise = np.mean(noise**2)
# Derive the noise power that satisfies the target SNR, then scale.
p_target_noise = p_signal / (10 ** (snr_db / 10))
scaling_factor = np.sqrt(p_target_noise / p_noise)
noisy_signal = signal + (noise * scaling_factor)
# Peak-normalise to prevent clipping.
max_val = np.max(np.abs(noisy_signal))
if max_val > 1.0:
noisy_signal = noisy_signal / max_val
return noisy_signal