Source code for s3prl.dataio.corpus.fluent_speech_commands

"""
Parse the Fluent Speech Command corpus

Authors:
  * Leo 2022
  * Cheng Liang 2022
"""

import logging
from collections import OrderedDict
from pathlib import Path

import pandas as pd

from .base import Corpus

logger = logging.getLogger(__name__)

__all__ = [
    "FluentSpeechCommands",
]


class FluentSpeechCommands(Corpus):
    """
    Parse the Fluent Speech Command dataset

    Args:
        dataset_root: (str) The dataset root of Fluent Speech Command
    """

    def __init__(self, dataset_root: str, n_jobs: int = 4) -> None:
        self.dataset_root = Path(dataset_root)
        # Each split CSV has one row per utterance; key every row by a
        # unique name derived from its wav path (see _get_unique_name)
        self.train = self.dataframe_to_datapoints(
            pd.read_csv(self.dataset_root / "data" / "train_data.csv"),
            self._get_unique_name,
        )
        self.valid = self.dataframe_to_datapoints(
            pd.read_csv(self.dataset_root / "data" / "valid_data.csv"),
            self._get_unique_name,
        )
        self.test = self.dataframe_to_datapoints(
            pd.read_csv(self.dataset_root / "data" / "test_data.csv"),
            self._get_unique_name,
        )

        # Merge all splits into one id -> parsed-row mapping, preserving
        # train -> valid -> test insertion order
        data_points = OrderedDict()
        data_points.update(self.train)
        data_points.update(self.valid)
        data_points.update(self.test)
        data_points = {key: self._parse_data(data) for key, data in data_points.items()}
        self._all_data = data_points

    @staticmethod
    def _get_unique_name(data_point):
        # The wav filename stem serves as the data id; presumably unique
        # across all three splits in this corpus
        return Path(data_point["path"]).stem

    def _parse_data(self, data):
        # Normalize a raw CSV row into the dict exposed by all_data,
        # resolving the relative wav path against the dataset root
        return dict(
            path=self.dataset_root / data["path"],
            speakerId=data["speakerId"],
            transcription=data["transcription"],
            action=data["action"],
            object=data["object"],
            location=data["location"],
        )

    @property
    def all_data(self):
        """
        Return all the data points in a dict of the format

        .. code-block:: yaml

            data_id1:
                path: (str) The waveform path
                speakerId: (str) The speaker name
                transcription: (str) The transcription
                action: (str) The action
                object: (str) The action's targeting object
                location: (str) The location where the action happens

            data_id2:
                ...
        """
        return self._all_data

    @property
    def data_split(self):
        """
        Return a list: :code:`train_data`, :code:`valid_data`, :code:`test_data`

        each is a dict following the format specified in :obj:`all_data`
        """
        return super().data_split

    @property
    def data_split_ids(self):
        """
        Return a list: :code:`train_ids`, :code:`valid_ids`, :code:`test_ids`

        Each is a list containing data_ids. data_ids can be used as the key to access
        the :obj:`all_data`
        """
        return (
            list(self.train.keys()),
            list(self.valid.keys()),
            list(self.test.keys()),
        )

    @classmethod
    def download_dataset(cls, tgt_dir: str) -> None:
        """
        Download and unzip the dataset to :code:`tgt_dir`/fluent_speech_commands_dataset

        Args:
            tgt_dir (str): The root directory containing many different datasets
        """
        import os
        import tarfile

        import requests

        tgt_dir = Path(tgt_dir)
        # FIX: keyword was misspelled `exists_ok`, which raised TypeError
        # on every call; the correct Path.mkdir keyword is `exist_ok`
        tgt_dir.mkdir(exist_ok=True, parents=True)

        def unzip_targz_then_delete(filepath: str):
            # NOTE(review): extractall on a downloaded archive trusts its member
            # paths; consider tarfile's `filter="data"` (Python 3.12+) to guard
            # against path traversal
            with tarfile.open(os.path.abspath(filepath)) as tar:
                tar.extractall(path=os.path.abspath(tgt_dir))
            os.remove(os.path.abspath(filepath))

        def download_from_url(url: str):
            filename = url.split("/")[-1].replace(" ", "_")
            filepath = os.path.join(tgt_dir, filename)

            r = requests.get(url, stream=True)
            if r.ok:
                # FIX: the original passed an extra positional arg to a message
                # with no %s placeholder, so logging raised a formatting error
                # and the path was never shown; use lazy %-style args
                logger.info("Saving %s to %s", filename, os.path.abspath(filepath))
                with open(filepath, "wb") as f:
                    # Stream in 10 MiB chunks to bound memory usage
                    for chunk in r.iter_content(chunk_size=1024 * 1024 * 10):
                        if chunk:
                            f.write(chunk)
                            f.flush()
                            os.fsync(f.fileno())
                logger.info("%s successfully downloaded", filename)
                unzip_targz_then_delete(filepath)
            else:
                logger.info(
                    "Download failed: status code %s\n%s", r.status_code, r.text
                )

        # Only download when the expected extracted layout is missing
        if not (
            os.path.exists(
                os.path.join(
                    os.path.abspath(tgt_dir), "fluent_speech_commands_dataset/wavs"
                )
            )
            and os.path.exists(
                os.path.join(
                    os.path.abspath(tgt_dir),
                    "fluent_speech_commands_dataset/data/speakers",
                )
            )
        ):
            download_from_url("http://140.112.21.28:9000/fluent.tar.gz")

        logger.info(
            "Fluent speech commands dataset downloaded. Located at %s/fluent_speech_commands_dataset/",
            os.path.abspath(tgt_dir),
        )