galilai-group · snkv04 · Feb 8, 2026 · Feb 8, 2026 · Feb 8, 2026 · Feb 8, 2026
diff --git a/.github/workflows/testing.yaml b/.github/workflows/testing.yaml
@@ -40,6 +40,12 @@ jobs:
           python-version: "3.10"
           cache: "pip"
 
+      # Install system dependencies (torchcodec needs the FFmpeg libraries to decode audio)
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ffmpeg
+
       # Install dependencies
       - name: Install dependencies
         run: |

diff --git a/docs/source/datasets/clotho.rst b/docs/source/datasets/clotho.rst
@@ -0,0 +1,102 @@
+Clotho
+==========
+
+.. raw:: html
+
+   <p style="display: flex; gap: 10px;">
+   <img src="https://img.shields.io/badge/Task-Audio%20Captioning-blue" alt="Task: Audio Captioning">
+   <img src="https://img.shields.io/badge/Captions%20per%20Audio%20Sample-5-green" alt="Captions per Audio Sample: 5">
+   <img src="https://img.shields.io/badge/Caption%20Length-8%20to%2020%20words-orange" alt="Caption Length: 8 to 20 words">
+   </p>
+
+Overview
+--------
+
+The Clotho dataset contains audio samples with a wide variety of general audio content, such as wind blowing, TV static, paper shredding, etc. The total dataset contains 4,981 audio samples, divided into "development" (train), "evaluation" (validation), and testing splits. However, the testing split is not available here in ``stable-datasets``, as it is withheld by the dataset's creators "for potential usage in scientific challenges". Every audio sample has exactly 5 captions, each ranging from 8 to 20 words, describing the audio content of the sample in free-form English text. Below are the dataset sizes:
+
+- **Train**: 2,893 samples (14,465 total captions)
+- **Validation**: 1,045 samples (5,225 total captions)
+
+A sample audio clip from the dataset can be heard at ``docs/source/datasets/teasers/clotho_teaser.wav``.
+
+Data Structure
+--------------
+
+When accessing an example using ``ds[i]``, you will receive a dictionary with the following keys:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 20 20 60
+
+   * - Key
+     - Type
+     - Description
+   * - ``audio``
+     - ``torchcodec.decoders.AudioDecoder``
+     - Audio clip loaded from a .wav file
+   * - ``captions``
+     - ``list[str]``
+     - List of 5 captions
+   * - ``keywords``
+     - ``list[str]``
+     - List of keywords relevant to the audio clip
+   * - ``freesound_id``
+     - ``int``
+     - ID of original audio clip from Freesound
+   * - ``freesound_link``
+     - ``str``
+     - URL to original audio clip from Freesound
+   * - ``start_sample``
+     - ``int``
+     - Sample position in the original Freesound audio at which this audio clip starts
+   * - ``end_sample``
+     - ``int``
+     - Sample position in the original Freesound audio at which this audio clip ends
+   * - ``manufacturer``
+     - ``str``
+     - The Freesound user who published the audio
+   * - ``license``
+     - ``str``
+     - The license that the audio is published under
+
+
+Usage Example
+-------------
+
+.. code-block:: python
+
+    from stable_datasets.timeseries.clotho import Clotho
+
+    # First run will download + prepare cache, then return the split as a HF Dataset
+    ds = Clotho(split="train")
+
+    # Can access attributes of each sample through standard Python dict indexing
+    sample = ds[0]
+    print(sample.keys())
+    print(f"Captions: {sample['captions']}")
+    print(f"Keywords: {sample['keywords']}")
+
+    # Optional: make it PyTorch-friendly
+    ds_torch = ds.with_format("torch")
+
+References
+----------
+
+- Homepage: https://github.com/audio-captioning/clotho-dataset
+
+Citation
+--------
+
+.. code-block:: bibtex
+
+    @inproceedings{9052990,
+        author={Drossos, Konstantinos and Lipping, Samuel and Virtanen, Tuomas},
+        booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
+        title={Clotho: an Audio Captioning Dataset}, 
+        year={2020},
+        volume={},
+        number={},
+        pages={736-740},
+        keywords={Training;Conferences;Employment;Signal processing;Task analysis;Speech processing;Tuning;audio captioning;dataset;Clotho},
+        doi={10.1109/ICASSP40776.2020.9052990}
+    }
diff --git a/docs/source/datasets/index.rst b/docs/source/datasets/index.rst
@@ -65,5 +65,11 @@ Available Datasets
    rock_paper_scissor
    linnaeus5
 
+.. toctree::
+   :maxdepth: 1
+   :caption: Time-Series Datasets
+
+   clotho
+
 .. note::
    Documentation is being added progressively, as datasets are ready for usage. Please only use datasets found in the documentation.
diff --git a/docs/source/datasets/teasers/clotho_teaser.wav b/docs/source/datasets/teasers/clotho_teaser.wav
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,8 @@ dependencies = [
     "h5py",
     "requests_cache",
     "pre-commit>=4.5.0",
+    "py7zr",
+    "torchcodec",
 ]
 
 dynamic = ["version"]

diff --git a/stable_datasets/tests/timeseries/test_clotho.py b/stable_datasets/tests/timeseries/test_clotho.py
@@ -0,0 +1,112 @@
+import os
+import re
+
+from loguru import logger as logging
+from torchcodec.decoders import AudioDecoder
+
+from stable_datasets.timeseries.clotho import Clotho
+
+
+def test_clotho_dataset():
+    # Test 1: Checks number of samples
+    download_dir = f"/cs/data/people/{os.getenv('USER')}/.stable_datasets/downloads"
+    processed_cache_dir = f"/cs/data/people/{os.getenv('USER')}/.stable_datasets/processed"
+    clotho_train = Clotho(
+        split="train",
+        download_dir=download_dir,
+        processed_cache_dir=processed_cache_dir,
+    )
+    expected_num_samples = 2893
+    assert len(clotho_train) == expected_num_samples, (
+        f"Expected {expected_num_samples} training samples, got {len(clotho_train)}."
+    )
+
+    # Test 2: Checks sample keys
+    sample = clotho_train[0]
+    expected_keys = {
+        "audio",
+        "captions",
+        "keywords",
+        "freesound_id",
+        "freesound_link",
+        "start_sample",
+        "end_sample",
+        "manufacturer",
+        "license",
+    }
+    assert set(sample.keys()) == expected_keys, f"Expected keys {expected_keys}, got {set(sample.keys())}"
+
+    # Test 3: Checks sample value types
+    audio = sample["audio"]
+    assert isinstance(audio, AudioDecoder), f"Audio field should be an AudioDecoder, got {type(audio)}."
+
+    captions = sample["captions"]
+    assert isinstance(captions, list), f"Captions field should be a list, got {type(captions)}."
+    for caption in captions:
+        assert isinstance(caption, str), f"Each caption should be a string, got {type(caption)}."
+
+    keywords = sample["keywords"]
+    assert isinstance(keywords, list), f"Keywords field should be a list, got {type(keywords)}."
+    for keyword in keywords:
+        assert isinstance(keyword, str), f"Each keyword should be a string, got {type(keyword)}."
+
+    freesound_id = sample["freesound_id"]
+    assert isinstance(freesound_id, int) or freesound_id is None, (
+        f"Freesound ID field should be an integer or None, got {type(freesound_id)}."
+    )
+    freesound_link = sample["freesound_link"]
+    assert isinstance(freesound_link, str) or freesound_link is None, (
+        f"Freesound link field should be a string or None, got {type(freesound_link)}."
+    )
+    start_sample = sample["start_sample"]
+    assert isinstance(start_sample, int), f"Start sample field should be an integer, got {type(start_sample)}."
+    end_sample = sample["end_sample"]
+    assert isinstance(end_sample, int), f"End sample field should be an integer, got {type(end_sample)}."
+    manufacturer = sample["manufacturer"]
+    assert isinstance(manufacturer, str), f"Manufacturer field should be a string, got {type(manufacturer)}."
+    license = sample["license"]
+    assert isinstance(license, str), f"License field should be a string, got {type(license)}."
+
+    # Test 4: Checks sample value properties
+    expected_sample_rate = 44100
+    assert audio.metadata.sample_rate == expected_sample_rate, (
+        f"Audio should be at {expected_sample_rate} Hz, got {audio.metadata.sample_rate}."
+    )
+    audio_data = audio.get_all_samples().data
+    assert len(audio_data.shape) == 2, f"Audio data should have 2 dimensions, got {len(audio_data.shape)}."
+    expected_num_channels = 1
+    assert audio_data.shape[0] == expected_num_channels, (
+        f"Audio data should have {expected_num_channels} channels, got {audio_data.shape[0]}."
+    )
+    assert (start_sample is None and end_sample is None) or (
+        start_sample is not None and end_sample is not None and start_sample < end_sample
+    ), f"Start sample should be less than end sample, got {start_sample} and {end_sample}."
+
+    assert len(captions) == 5, f"Captions field should have 5 elements, got {len(captions)}."
+    for caption in captions:
+        num_words = len(caption.split())
+        assert num_words >= 8 and num_words <= 20, f"Each caption should have between 8 and 20 words, got {num_words}."
+    assert len(keywords) > 0, "Keywords field should not be empty."
+
+    if not (freesound_id is None and freesound_link is None):
+        assert isinstance(freesound_id, int), f"Freesound ID field should be an integer, got {type(freesound_id)}."
+        assert isinstance(freesound_link, str), f"Freesound link field should be a string, got {type(freesound_link)}."
+        pattern = rf"^https://freesound\.org/people/[^/]+/sounds/{freesound_id}$"
+        assert re.match(pattern, freesound_link), f"Freesound link should match the pattern, got {freesound_link}."
+
+    # Test 5: Checks number of samples in validation split
+    clotho_validation = Clotho(
+        split="validation",
+        download_dir=download_dir,
+        processed_cache_dir=processed_cache_dir,
+    )
+    expected_num_samples = 1045
+    assert len(clotho_validation) == expected_num_samples, (
+        f"Expected {expected_num_samples} validation samples, got {len(clotho_validation)}."
+    )
+
+    logging.info("All Clotho dataset tests passed successfully!")
+
+
+# Allow running this test directly as a script, without going through pytest.
+if __name__ == "__main__":
+    test_clotho_dataset()
diff --git a/stable_datasets/timeseries/__init__.py b/stable_datasets/timeseries/__init__.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python
 
+from .clotho import Clotho
+
+
+__all__ = [
+    "Clotho",
+]
+
 # from . import (
 #    VoiceGenderDetection,
 #    JapaneseVowels,