Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions stable_datasets/tests/timeseries/test_librispeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pytest

from stable_datasets.timeseries.librispeech import LibriSpeech


@pytest.mark.large
def test_librispeech_train_split():
    """Exercise the ``train`` split (train-clean-100) of LibriSpeech end to end.

    Downloading the split pulls ~6.3GB and may take several minutes, so the
    test only runs when requested explicitly: ``pytest -m large``.
    """
    ds = LibriSpeech(split="train")

    # Size check: train-clean-100 contains roughly 28,539 utterances.
    assert len(ds) > 25000, f"Expected >25,000 training samples, got {len(ds)}."

    # Schema check: the first example exposes exactly the declared fields.
    sample = ds[0]
    expected_keys = {"audio", "sample_rate", "speaker_id", "transcript"}
    assert set(sample.keys()) == expected_keys, f"Expected keys {expected_keys}, got {set(sample.keys())}"

    # Waveform check: a non-empty Python list whose elements are floats.
    audio = sample["audio"]
    assert isinstance(audio, list), f"Audio should be a list, got {type(audio)}."
    # A list is truthy exactly when it has at least one element.
    assert audio, "Audio waveform should not be empty."
    assert isinstance(audio[0], float), f"Audio samples should be floats, got {type(audio[0])}."

    # Sample-rate check: LibriSpeech audio is recorded at 16kHz.
    assert sample["sample_rate"] == 16000, f"Sample rate should be 16000, got {sample['sample_rate']}."

    # Speaker check: the ID is a strictly positive integer.
    speaker_id = sample["speaker_id"]
    assert isinstance(speaker_id, int), f"Speaker ID should be int, got {type(speaker_id)}."
    assert 0 < speaker_id, f"Speaker ID should be positive, got {speaker_id}."

    # Transcript check: a non-empty string.
    transcript = sample["transcript"]
    assert isinstance(transcript, str), f"Transcript should be a string, got {type(transcript)}."
    # A str is truthy exactly when its length is non-zero.
    assert transcript, "Transcript should not be empty."

    print(f"All LibriSpeech train tests passed! ({len(ds)} samples)")


@pytest.mark.large
def test_librispeech_test_split():
    """Smoke-test the ``test`` split (test-clean) of LibriSpeech."""
    ds = LibriSpeech(split="test")

    # test-clean contains roughly 2,620 utterances.
    assert len(ds) > 2000, f"Expected >2,000 test samples, got {len(ds)}."

    first = ds[0]
    # Same checks, same order as spelling them out one by one.
    for required_key in ("audio", "transcript"):
        assert required_key in first
    assert first["sample_rate"] == 16000

    print(f"All LibriSpeech test tests passed! ({len(ds)} samples)")


@pytest.mark.large
def test_librispeech_returns_dataset_dict_when_no_split(tmp_path):
    """Placeholder: ``split=None`` is intended to return a StableDatasetDict.

    The check is not implemented yet because it would need every split to be
    downloaded. The test carries the ``large`` marker, like the other
    download-heavy tests in this module, so marker-based selection treats it
    consistently; it is then skipped unconditionally until a real assertion
    is written.

    ``tmp_path`` is requested but not used yet.
    """
    # The previous reason told the user to run with ``-m large``, which could
    # never make this test execute; say what is actually going on instead.
    pytest.skip("Skipping: not implemented yet; verifying split=None requires the full download.")


def test_librispeech_source_contract():
    """Check the shape of LibriSpeech's class-level metadata (no download needed)."""
    # The original author notes that validation also happens at
    # class-definition time via __init_subclass__, so merely importing the
    # class already exercised it; the asserts below pin the expected layout.
    for attribute in ("VERSION", "SOURCE"):
        assert hasattr(LibriSpeech, attribute)

    for top_level_key in ("homepage", "citation", "assets"):
        assert top_level_key in LibriSpeech.SOURCE

    for split_name in ("train", "test"):
        assert split_name in LibriSpeech.SOURCE["assets"]

    print("LibriSpeech SOURCE contract test passed!")
3 changes: 3 additions & 0 deletions stable_datasets/timeseries/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@
# https://github.com/YashNita/Animal-Sound-Dataset
# pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', sep="\t", names = ['Frequency','Angle of attack','Chord length','Free-stream velocity','Suction/side','Scaled/sound'])
# https://dagshub.com/DagsHub/audio-datasets/src/main/voice_gender_detection
# Re-export the LibriSpeech builder at package level.
from .librispeech import LibriSpeech

# Names exported by a star-import of this package.
__all__ = ["LibriSpeech"]
124 changes: 124 additions & 0 deletions stable_datasets/timeseries/librispeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import io
import tarfile

import numpy as np

from stable_datasets.schema import DatasetInfo, Features, Sequence, Value, Version
from stable_datasets.utils import BaseDatasetBuilder


class LibriSpeech(BaseDatasetBuilder):
    """Automatic Speech Recognition / Speaker Classification.

    `LibriSpeech <https://www.openslr.org/12>`_ is a corpus of approximately 1000
    hours of 16kHz read English speech, derived from audiobooks in the LibriVox
    project. This builder loads the **train-clean-100** subset (100 hours, ~28.5k
    utterances) and the **test-clean** subset (~2.6k utterances).

    Each example contains the raw waveform (as float32 samples), the speaker ID,
    the transcript text, and the sample rate.

    Requires the ``soundfile`` package (``pip install soundfile``).
    """

    VERSION = Version("1.0.0")

    # One archive URL per split, plus the homepage and BibTeX citation.
    SOURCE = {
        "homepage": "https://www.openslr.org/12",
        "assets": {
            "train": "https://www.openslr.org/resources/12/train-clean-100.tar.gz",
            "test": "https://www.openslr.org/resources/12/test-clean.tar.gz",
        },
        "citation": (
            "@inproceedings{panayotov2015librispeech,\n"
            "title={Librispeech: an {ASR} corpus based on public domain audio books},\n"
            "author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n"
            "booktitle={2015 IEEE International Conference on Acoustics, Speech and\n"
            "Signal Processing (ICASSP)},\n"
            "pages={5206--5210},\n"
            "year={2015},\n"
            "organization={IEEE}}"
        ),
    }

    def _info(self):
        """Return the dataset metadata: description, feature schema and provenance."""
        return DatasetInfo(
            description=(
                "LibriSpeech is a corpus of approximately 1000 hours of 16kHz "
                "read English speech derived from audiobooks. This builder "
                "provides the train-clean-100 and test-clean subsets."
            ),
            features=Features(
                {
                    "audio": Sequence(Value("float32")),
                    "sample_rate": Value("int32"),
                    "speaker_id": Value("int64"),
                    "transcript": Value("string"),
                }
            ),
            supervised_keys=("audio", "transcript"),
            homepage=self.SOURCE["homepage"],
            citation=self.SOURCE["citation"],
        )

    def _generate_examples(self, data_path, split):
        """Generate examples from the LibriSpeech tar.gz archive.

        The archive structure is::

            LibriSpeech/<subset>/
                <speaker_id>/
                    <chapter_id>/
                        <speaker_id>-<chapter_id>-<utterance_id>.flac
                        <speaker_id>-<chapter_id>.trans.txt

        The archive is read twice: once to collect every transcript, then once
        to decode the audio and pair each utterance with its transcript.

        Args:
            data_path: Path of the gzip-compressed tar archive to read.
            split: Name of the split being generated. Not used here; the
                archive at ``data_path`` alone determines the examples.

        Yields:
            ``(index, example)`` pairs. ``index`` counts up from 0 and
            ``example`` is a dict with keys ``audio`` (list of float32-rounded
            samples), ``sample_rate``, ``speaker_id`` (int parsed from the
            file name) and ``transcript`` (``""`` when no transcript line
            exists for the utterance).

        Raises:
            ImportError: If the ``soundfile`` package is not installed.
        """
        try:
            import soundfile as sf
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "LibriSpeech requires the 'soundfile' package. "
                "Install it with: pip install soundfile"
            ) from err

        flac_suffix = ".flac"

        # First pass: collect all transcripts from .trans.txt files.
        # Iterating the TarFile (rather than calling getmembers()) hands out
        # each member as its header is read, so the gzip stream is consumed in
        # one forward sweep. getmembers() would scan to the end of the archive
        # first and force a rewind plus a second decompression on first read.
        transcripts = {}
        with tarfile.open(data_path, "r:gz") as tar:
            for member in tar:
                if not member.name.endswith(".trans.txt"):
                    continue
                f = tar.extractfile(member)
                if f is None:
                    continue
                for line in f.read().decode("utf-8").strip().splitlines():
                    # Each line is "<utterance_id> <text>"; skip lines without a space.
                    utterance_id, separator, text = line.partition(" ")
                    if separator:
                        transcripts[utterance_id] = text

        # Second pass: read audio files and pair with transcripts.
        idx = 0
        with tarfile.open(data_path, "r:gz") as tar:
            for member in tar:
                if not member.name.endswith(flac_suffix):
                    continue

                f = tar.extractfile(member)
                if f is None:
                    continue

                # Decode from an in-memory copy so the decoder can seek freely
                # without seeking inside the compressed archive stream.
                audio_data, sample_rate = sf.read(io.BytesIO(f.read()))

                # Path: LibriSpeech/<subset>/<speaker>/<chapter>/<spk>-<chap>-<utt>.flac
                filename = member.name.rsplit("/", 1)[-1]
                # Strip only the trailing extension (its presence was checked above).
                utterance_id = filename[: -len(flac_suffix)]
                speaker_id = int(utterance_id.split("-")[0])

                # Utterances without a transcript line get an empty transcript.
                transcript = transcripts.get(utterance_id, "")

                yield idx, {
                    "audio": audio_data.astype(np.float32).tolist(),
                    "sample_rate": sample_rate,
                    "speaker_id": speaker_id,
                    "transcript": transcript,
                }
                idx += 1