galilai-group · Anurag9Dhiman · Apr 16, 2026
diff --git a/stable_datasets/tests/timeseries/test_librispeech.py b/stable_datasets/tests/timeseries/test_librispeech.py
@@ -0,0 +1,85 @@
+import pytest
+
+from stable_datasets.timeseries.librispeech import LibriSpeech
+
+
+@pytest.mark.large
+def test_librispeech_train_split():
+    """Check size and per-example schema of the train-clean-100 split.
+
+    Downloading the archive (~6.3GB) can take several minutes, so the test
+    is opt-in: ``pytest -m large``.
+    """
+    dataset = LibriSpeech(split="train")
+    num_examples = len(dataset)
+
+    # Size: train-clean-100 holds roughly 28,539 utterances, so 25,000 is a
+    # loose lower bound.
+    assert num_examples > 25000, f"Expected >25,000 training samples, got {num_examples}."
+
+    # Schema: the first example exposes exactly these four fields.
+    first = dataset[0]
+    expected_keys = {"audio", "sample_rate", "speaker_id", "transcript"}
+    actual_keys = set(first.keys())
+    assert actual_keys == expected_keys, f"Expected keys {expected_keys}, got {actual_keys}"
+
+    # Waveform: a non-empty list of Python floats.
+    waveform = first["audio"]
+    assert isinstance(waveform, list), f"Audio should be a list, got {type(waveform)}."
+    assert len(waveform) > 0, "Audio waveform should not be empty."
+    assert isinstance(waveform[0], float), f"Audio samples should be floats, got {type(waveform[0])}."
+
+    # Sampling rate: LibriSpeech audio is 16kHz.
+    rate = first["sample_rate"]
+    assert rate == 16000, f"Sample rate should be 16000, got {rate}."
+
+    # Speaker: a strictly positive integer ID.
+    speaker = first["speaker_id"]
+    assert isinstance(speaker, int), f"Speaker ID should be int, got {type(speaker)}."
+    assert speaker > 0, f"Speaker ID should be positive, got {speaker}."
+
+    # Transcript: a non-empty string.
+    text = first["transcript"]
+    assert isinstance(text, str), f"Transcript should be a string, got {type(text)}."
+    assert len(text) > 0, "Transcript should not be empty."
+
+    print(f"All LibriSpeech train tests passed! ({num_examples} samples)")
+
+
+@pytest.mark.large
+def test_librispeech_test_split():
+    """Smoke-check the size and first example of the test-clean split."""
+    dataset = LibriSpeech(split="test")
+    num_examples = len(dataset)
+    # About 2,620 utterances are expected in test-clean.
+    assert num_examples > 2000, f"Expected >2,000 test samples, got {num_examples}."
+
+    first = dataset[0]
+    for field in ("audio", "transcript"):
+        assert field in first
+    assert first["sample_rate"] == 16000
+
+    print(f"All LibriSpeech test tests passed! ({num_examples} samples)")
+
+
+@pytest.mark.large
+def test_librispeech_returns_dataset_dict_when_no_split(tmp_path):
+    """Placeholder: split=None should return a StableDatasetDict."""
+    # Needs the full download, so it carries the ``large`` marker like the other
+    # download tests (otherwise ``pytest -m large`` deselects it); still skipped.
+    pytest.skip("Skipping: requires full download. Run with -m large manually.")
+
+
+def test_librispeech_source_contract():
+    """Check the class-level VERSION/SOURCE metadata; nothing is downloaded."""
+    # NOTE: the base builder reportedly validates these via __init_subclass__ at
+    # class-definition time, so reaching this point already implies success.
+    for attribute in ("VERSION", "SOURCE"):
+        assert hasattr(LibriSpeech, attribute)
+    source = LibriSpeech.SOURCE
+    for key in ("homepage", "citation", "assets"):
+        assert key in source
+    for split in ("train", "test"):
+        assert split in source["assets"]
+
+    print("LibriSpeech SOURCE contract test passed!")
diff --git a/stable_datasets/timeseries/__init__.py b/stable_datasets/timeseries/__init__.py
@@ -32,3 +32,6 @@
 # https://github.com/YashNita/Animal-Sound-Dataset
 # pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat', sep="\t", names = ['Frequency','Angle of attack','Chord length','Free-stream velocity','Suction/side','Scaled/sound'])
 # https://dagshub.com/DagsHub/audio-datasets/src/main/voice_gender_detection
+from .librispeech import LibriSpeech
+
+__all__ = ["LibriSpeech"]
diff --git a/stable_datasets/timeseries/librispeech.py b/stable_datasets/timeseries/librispeech.py
@@ -0,0 +1,124 @@
+import io
+import tarfile
+
+import numpy as np
+
+from stable_datasets.schema import DatasetInfo, Features, Sequence, Value, Version
+from stable_datasets.utils import BaseDatasetBuilder
+
+
+class LibriSpeech(BaseDatasetBuilder):
+    """Automatic Speech Recognition / Speaker Classification.
+
+    `LibriSpeech <https://www.openslr.org/12>`_ is a corpus of approximately 1000
+    hours of 16kHz read English speech, derived from audiobooks in the LibriVox
+    project. This builder loads the **train-clean-100** subset (100 hours, ~28.5k
+    utterances) and the **test-clean** subset (~2.6k utterances).
+
+    Each example contains the raw waveform (as float32 samples), the speaker ID,
+    the transcript text, and the sample rate.
+
+    Requires the ``soundfile`` package (``pip install soundfile``).
+    """
+
+    VERSION = Version("1.0.0")
+
+    SOURCE = {
+        "homepage": "https://www.openslr.org/12",
+        "assets": {
+            "train": "https://www.openslr.org/resources/12/train-clean-100.tar.gz",
+            "test": "https://www.openslr.org/resources/12/test-clean.tar.gz",
+        },
+        "citation": """@inproceedings{panayotov2015librispeech,
+            title={Librispeech: an {ASR} corpus based on public domain audio books},
+            author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
+            booktitle={2015 IEEE International Conference on Acoustics, Speech and
+                       Signal Processing (ICASSP)},
+            pages={5206--5210},
+            year={2015},
+            organization={IEEE}}""",
+    }
+
+    def _info(self):
+        """Return the DatasetInfo describing the features and SOURCE metadata."""
+        return DatasetInfo(
+            description=(
+                "LibriSpeech is a corpus of approximately 1000 hours of 16kHz "
+                "read English speech derived from audiobooks. This builder "
+                "provides the train-clean-100 and test-clean subsets."
+            ),
+            features=Features(
+                {
+                    "audio": Sequence(Value("float32")),
+                    "sample_rate": Value("int32"),
+                    "speaker_id": Value("int64"),
+                    "transcript": Value("string"),
+                }
+            ),
+            supervised_keys=("audio", "transcript"),
+            homepage=self.SOURCE["homepage"],
+            citation=self.SOURCE["citation"],
+        )
+
+    def _generate_examples(self, data_path, split):
+        """Yield ``(index, example)`` pairs from a LibriSpeech tar.gz archive.
+
+        The archive structure is::
+
+            LibriSpeech/<subset>/
+                <speaker_id>/
+                    <chapter_id>/
+                        <speaker_id>-<chapter_id>-<utterance_id>.flac
+                        <speaker_id>-<chapter_id>.trans.txt
+
+        ``split`` is unused; ``data_path`` is the archive to read. Audio without a
+        transcript line gets ``""``. Raises ImportError if soundfile is missing.
+        """
+        try:
+            import soundfile as sf
+        except ImportError as err:
+            raise ImportError(
+                "LibriSpeech requires the 'soundfile' package. "
+                "Install it with: pip install soundfile"
+            ) from err
+
+        # Pass 1: collect every transcript first, since a .trans.txt member may be
+        # stored after the audio it describes. Iterating the TarFile (rather than
+        # calling getmembers()) keeps reads on the gzip stream sequential.
+        transcripts = {}
+        with tarfile.open(data_path, "r:gz") as tar:
+            for member in tar:
+                if not member.name.endswith(".trans.txt"):
+                    continue
+                f = tar.extractfile(member)
+                if f is None:
+                    continue
+                with f:
+                    lines = f.read().decode("utf-8").strip().splitlines()
+                for line in lines:
+                    utterance_id, sep, text = line.partition(" ")
+                    if sep:  # lines without a space carry no transcript
+                        transcripts[utterance_id] = text
+
+        # Pass 2: decode each FLAC member and pair it with its transcript.
+        idx = 0
+        with tarfile.open(data_path, "r:gz") as tar:
+            for member in tar:
+                if not member.name.endswith(".flac"):
+                    continue
+                f = tar.extractfile(member)
+                if f is None:
+                    continue
+                with f:
+                    audio_data, sample_rate = sf.read(io.BytesIO(f.read()))
+                # Member path: LibriSpeech/<subset>/<spk>/<chap>/<spk>-<chap>-<utt>.flac
+                filename = member.name.rsplit("/", 1)[-1]
+                utterance_id = filename[: -len(".flac")]
+                speaker_id = int(utterance_id.split("-", 1)[0])
+                yield idx, {
+                    "audio": audio_data.astype(np.float32).tolist(),
+                    "sample_rate": sample_rate,
+                    "speaker_id": speaker_id,
+                    "transcript": transcripts.get(utterance_id, ""),
+                }
+                idx += 1