Source code for s3prl.dataio.corpus.quesst14

"""
Parse the QUESST14 corpus

Authors:
  * Leo 2022
  * Cheng Liang 2022
"""

import logging
import re
from pathlib import Path

logger = logging.getLogger(__name__)

__all__ = [
    "Quesst14",
]


class Quesst14:
    def __init__(self, dataset_root: str):
        dataset_root = Path(dataset_root)
        self.doc_paths = self._english_audio_paths(
            dataset_root, "language_key_utterances.lst"
        )
        self.dev_query_paths = self._english_audio_paths(
            dataset_root, "language_key_dev.lst"
        )
        self.eval_query_paths = self._english_audio_paths(
            dataset_root, "language_key_eval.lst"
        )

        self.n_dev_queries = len(self.dev_query_paths)
        self.n_eval_queries = len(self.eval_query_paths)
        self.n_docs = len(self.doc_paths)

    @staticmethod
    def _english_audio_paths(dataset_root_path, lst_name):
        """Extract English audio paths."""
        audio_paths = []

        with open(dataset_root_path / "scoring" / lst_name) as f:
            for line in f:
                audio_path, lang = tuple(line.strip().split())
                if lang != "nnenglish":
                    continue
                audio_path = re.sub(r"^.*?\/", "", audio_path)
                audio_paths.append(dataset_root_path / audio_path)

        return audio_paths

    @property
    def valid_queries(self):
        return self.dev_query_paths

    @property
    def test_queries(self):
        return self.eval_query_paths

    @property
    def docs(self):
        """Valid and Test share the same document database"""
        return self.doc_paths
    @classmethod
    def download_dataset(cls, tgt_dir: str) -> None:
        """Download quesst14Database.tgz and extract it into tgt_dir."""
        import os
        import tarfile

        import requests

        assert os.path.exists(
            os.path.abspath(tgt_dir)
        ), "Target directory does not exist"

        def unzip_targz_then_delete(filepath: str):
            with tarfile.open(os.path.abspath(filepath)) as tar:
                tar.extractall(path=os.path.abspath(tgt_dir))
            os.remove(os.path.abspath(filepath))

        def download_from_url(url: str):
            filename = url.split("/")[-1].replace(" ", "_")
            filepath = os.path.join(tgt_dir, filename)

            r = requests.get(url, stream=True)
            if r.ok:
                logger.info(f"Saving {filename} to {os.path.abspath(filepath)}")
                with open(filepath, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024 * 10):
                        if chunk:
                            f.write(chunk)
                            f.flush()
                            os.fsync(f.fileno())
                logger.info(f"{filename} successfully downloaded")
                unzip_targz_then_delete(filepath)
            else:
                logger.info(f"Download failed: status code {r.status_code}\n{r.text}")

        if not os.path.exists(
            os.path.join(os.path.abspath(tgt_dir), "quesst14Database/")
        ):
            download_from_url("https://speech.fit.vutbr.cz/files/quesst14Database.tgz")

        logger.info(
            f"Quesst14 dataset downloaded. Located at {os.path.abspath(tgt_dir)}/quesst14Database/"
        )
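
A minimal usage sketch, not part of the module source: the target directory ./corpora is only an example and must already exist, since download_dataset asserts the directory is present before downloading.

# Hypothetical example paths; adjust to your own storage layout.
from s3prl.dataio.corpus.quesst14 import Quesst14

# Fetch and extract quesst14Database.tgz into an existing directory.
Quesst14.download_dataset("./corpora")

# Point the corpus object at the extracted quesst14Database root.
corpus = Quesst14("./corpora/quesst14Database")

print(corpus.n_docs)              # number of English document utterances
print(len(corpus.valid_queries))  # dev query audio paths
print(len(corpus.test_queries))   # eval query audio paths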