Source code for s3prl.dataio.corpus.librilight

"""
Parse the LibriLight corpus

Authors:
  * Heng-Jui Chang 2022
"""

import logging
import os
import re
import subprocess
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

from joblib import Parallel, delayed

from s3prl.util.download import _urls_to_filepaths

from .base import Corpus

LIBRILIGHT_SPLITS = [
    "10h",
    "1h",
    "10m-fold0",
    "10m-fold1",
    "10m-fold2",
    "10m-fold3",
    "10m-fold4",
    "10m-fold5",
]
LIBRISPEECH_SPKR_INFO = (
    "https://huggingface.co/datasets/s3prl/librispeech_metadata/raw/main/SPEAKERS.TXT"
)


__all__ = [
    "LibriLight",
]


def read_text(file: Path) -> str:
    src_file = "-".join(str(file).split("-")[:-1]) + ".trans.txt"
    idx = file.stem.replace(".flac", "")

    with open(src_file, "r") as fp:
        for line in fp:
            if idx == line.split(" ")[0]:
                return line[:-1].split(" ", 1)[1]

    logging.warning(f"Transcription of {file} not found!")


def check_no_repeat(splits: List[str]) -> bool:
    count = defaultdict(int)
    for split in splits:
        count[split] += 1

    repeated = ""
    for key, val in count.items():
        if val > 1:
            repeated += f" {key} ({val} times)"

    if len(repeated) != 0:
        logging.warning(
            f"Found repeated splits in corpus: {repeated}, which might cause unexpected behaviors."
        )
        return False

    return True


def _parse_spk_to_gender(speaker_file: Path) -> dict:
    speaker_file = Path(speaker_file)
    with speaker_file.open() as file:
        lines = [line.strip() for line in file.readlines()]
    for line_id in range(len(lines)):
        line = lines[line_id]
        if "SEX" in line and "SUBSET" in line and "MINUTES" in line and "NAME" in line:
            break

    line_id += 1  # first line with speaker info
    spk2gender = {}
    for line_id in range(line_id, len(lines)):
        line = lines[line_id]
        line = re.sub("\t+", " ", line)
        line = re.sub(" +", " ", line)
        parts = line.split("|", maxsplit=4)
        ID, SEX, SUBSET, MINUTES, NAME = parts
        spk2gender[int(ID)] = SEX.strip()
    return spk2gender


[docs]class LibriLight(Corpus):
    def __init__(
        self,
        dataset_root: str,
        n_jobs: int = 4,
        train_split: str = "10m-fold0",
    ) -> None:
        self.dataset_root = Path(dataset_root).resolve()
        self.train_split = train_split

        if train_split == "10h":
            roots = [self.dataset_root / "1h", self.dataset_root / "9h"]
        elif train_split == "1h":
            roots = [self.dataset_root / "1h"]
        elif train_split.startswith("10m"):
            fold_id = int(train_split.split("-")[-1].split("fold")[-1])
            roots = [self.dataset_root / "1h" / str(fold_id)]
        else:
            raise ValueError(f"Unsupported split: {train_split}")

        self._data = self._collect_data(roots, n_jobs)

[docs]    @classmethod
    def download_dataset(cls, dataset_root: str):
        Path(dataset_root).mkdir(parents=True, exist_ok=True)
        subprocess.check_call(
            [
                "wget",
                "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz",
                "-O",
                str(Path(dataset_root) / "librispeech_finetuning.tgz"),
            ]
        )
        subprocess.check_call(
            ["tar", "zxvf", "librispeech_finetuning.tgz", "-C", str(Path(dataset_root))]
        )

    @property
    def all_data(self):
        return self._data

    @staticmethod
    def _collect_data(
        roots: List[Path], n_jobs: int = 4
    ) -> Dict[str, Dict[str, List[Any]]]:
        spkr_file = _urls_to_filepaths(LIBRISPEECH_SPKR_INFO)
        spkr2gender = _parse_spk_to_gender(Path(spkr_file).resolve())
        data_dict = {}
        for split_dir in roots:
            if not os.path.exists(split_dir):
                logging.info(
                    f"Split {split_dir} is not downloaded. Skip data collection."
                )
                continue

            wav_list = list(Path(split_dir).rglob("*.flac"))
            name_list = [file.stem.replace(".flac", "") for file in wav_list]
            text_list = Parallel(n_jobs=n_jobs)(
                delayed(read_text)(file) for file in wav_list
            )
            spkr_list = [int(name.split("-")[0]) for name in name_list]

            for wav_id in range(len(wav_list)):
                wav = Path(wav_list[wav_id])
                data_dict[wav.stem] = {
                    "wav_path": str(wav.resolve()),
                    "transcription": text_list[wav_id],
                    "speaker": spkr_list[wav_id],
                    "gender": spkr2gender[spkr_list[wav_id]],
                }
        return data_dict