corpus#

(s3prl.dataio.corpus)

Parse commonly used corpora into a standardized dictionary structure

s3prl.dataio.corpus.base

s3prl.dataio.corpus.fluent_speech_commands

Parse the Fluent Speech Commands corpus

s3prl.dataio.corpus.iemocap

Parse the IEMOCAP corpus

s3prl.dataio.corpus.librilight

Parse the LibriLight corpus

s3prl.dataio.corpus.librispeech

Parse the LibriSpeech corpus

s3prl.dataio.corpus.quesst14

Parse the QUESST14 corpus

s3prl.dataio.corpus.snips

Parse the Audio SNIPS corpus

s3prl.dataio.corpus.speech_commands

Parse the Google Speech Commands V1 corpus

s3prl.dataio.corpus.voxceleb1sid

Parse VoxCeleb1 corpus for classification

s3prl.dataio.corpus.voxceleb1sv

Parse VoxCeleb1 corpus for verification

FluentSpeechCommands#

class s3prl.dataio.corpus.FluentSpeechCommands(dataset_root: str, n_jobs: int = 4)[source][source]#

Bases: Corpus

Parse the Fluent Speech Commands dataset

Parameters:
  • dataset_root (str) – The dataset root of Fluent Speech Commands

  • n_jobs (int, optional) – Number of jobs. Defaults to 4.

property all_data[source]#

Return all the data points in a dict of the format

data_id1:
    path: (str) The waveform path
    speakerId: (str) The speaker name
    transcription: (str) The transcription
    action: (str) The action
    object: (str) The object targeted by the action
    location: (str) The location where the action happens

data_id2:
    ...
property data_split[source]#

Return a list:

train_data, valid_data, test_data

Each is a dict following the format specified in all_data

property data_split_ids[source]#

Return a list:

train_ids, valid_ids, test_ids

Each is a list of data_ids, which can be used as keys to access all_data

classmethod download_dataset(tgt_dir: str) None[source][source]#

Download and unzip the dataset to tgt_dir/fluent_speech_commands_dataset

Parameters:

tgt_dir (str) – The root directory containing many different datasets

static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
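
A minimal usage sketch based on the constructor and properties documented above; the dataset path is hypothetical:

from s3prl.dataio.corpus import FluentSpeechCommands

# hypothetical root; download_dataset() can populate it first if needed
corpus = FluentSpeechCommands("/path/to/fluent_speech_commands_dataset")

# data_split returns the train/valid/test dicts described above
train_data, valid_data, test_data = corpus.data_split
utt_id, point = next(iter(train_data.items()))
print(utt_id, point["action"], point["object"], point["location"])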

IEMOCAP#

class s3prl.dataio.corpus.IEMOCAP(dataset_root: str, n_jobs: int = 4)[source][source]#

Bases: Corpus

Parse the IEMOCAP dataset

Parameters:
  • dataset_root (str) – The dataset root of IEMOCAP

  • n_jobs (int, optional) – Number of jobs. Defaults to 4.

property all_data[source]#

Return all the data points of IEMOCAP in a dict of the format

data_id1:
    wav_path (str): The waveform path
    speaker (str): The speaker name
    act (str): improvised / scripted
    emotion (str): The emotion label
    session_id (int): The session

data_id2:
    ...
get_whole_session(session_id: int)[source][source]#
Parameters:

session_id (int) – The session index selected from 1, 2, 3, 4, 5

Returns:

dict

data points in a single session (containing improvised and scripted recordings) in the same format as all_data

get_session_with_act(session_id: int, act: str)[source][source]#
Parameters:
  • session_id (int) – The session index selected from 1, 2, 3, 4, 5

  • act (str) – ‘improvised’ or ‘scripted’

Returns:

s3prl.base.container.Container

data points in a single session with a specific act (either improvised or scripted) in the same format as all_data

classmethod download_dataset(tgt_dir: str) None[source][source]#
property data_split[source]#
abstract property data_split_ids[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
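
A short sketch of the session-level accessors documented above, assuming the returned Container behaves like a dict; the dataset path is hypothetical:

from s3prl.dataio.corpus import IEMOCAP

corpus = IEMOCAP("/path/to/IEMOCAP")  # hypothetical root

# all recordings of session 1, both improvised and scripted
session1 = corpus.get_whole_session(1)

# only the improvised recordings of session 1
improvised = corpus.get_session_with_act(1, "improvised")

for utt_id, point in list(improvised.items())[:3]:
    print(utt_id, point["speaker"], point["emotion"])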

LibriSpeech#

class s3prl.dataio.corpus.LibriSpeech(dataset_root: str, n_jobs: int = 4, train_split: List[str] = ['train-clean-100'], valid_split: List[str] = ['dev-clean'], test_split: List[str] = ['test-clean'])[source][source]#

Bases: Corpus

LibriSpeech Corpus Link: https://www.openslr.org/12

Parameters:
  • dataset_root (str) – Path to LibriSpeech corpus directory.

  • n_jobs (int, optional) – Number of jobs. Defaults to 4.

  • train_split (List[str], optional) – Training splits. Defaults to [“train-clean-100”].

  • valid_split (List[str], optional) – Validation splits. Defaults to [“dev-clean”].

  • test_split (List[str], optional) – Testing splits. Defaults to [“test-clean”].

get_corpus_splits(splits: List[str])[source][source]#
property all_data[source]#

Return all the data points in a dict of the format

data_id1:
    wav_path: (str) The waveform path
    transcription: (str) The transcription
    speaker: (str) The speaker name
    gender: (str) The speaker's gender
    corpus_split: (str) The split of corpus this sample belongs to

data_id2:
    ...
property data_split_ids[source]#
classmethod download_dataset(target_dir: str, splits: List[str] = ['train-clean-100', 'dev-clean', 'test-clean']) None[source][source]#
property data_split[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
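
A sketch of constructing the corpus with non-default splits, using only the arguments listed in the signature; the path is hypothetical and the listed splits are assumed to be present under the dataset root:

from s3prl.dataio.corpus import LibriSpeech

corpus = LibriSpeech(
    "/path/to/LibriSpeech",  # hypothetical dataset root
    train_split=["train-clean-100", "train-clean-360"],
    valid_split=["dev-clean"],
    test_split=["test-clean"],
)

train_data, valid_data, test_data = corpus.data_split
utt_id, point = next(iter(test_data.items()))
print(point["speaker"], point["corpus_split"], point["transcription"])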

LibriLight#

class s3prl.dataio.corpus.LibriLight(dataset_root: str, n_jobs: int = 4, train_split: str = '10m-fold0')[source][source]#

Bases: Corpus

classmethod download_dataset(dataset_root: str)[source][source]#
property all_data[source]#
property data_split[source]#
abstract property data_split_ids[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
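
A minimal sketch under the assumption that this class parses the LibriLight limited-resource fine-tuning splits; the path is hypothetical and train_split follows the constructor default:

from s3prl.dataio.corpus import LibriLight

corpus = LibriLight("/path/to/librilight", train_split="10m-fold0")  # hypothetical root
print(len(corpus.all_data))  # number of parsed data points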

Quesst14#

class s3prl.dataio.corpus.Quesst14(dataset_root: str)[source][source]#

Bases: object

property valid_queries[source]#
property test_queries[source]#
property docs[source]#

Valid and Test share the same document database

classmethod download_dataset(tgt_dir: str) None[source][source]#
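
A minimal sketch of the query/document accessors documented above; the dataset path is hypothetical:

from s3prl.dataio.corpus import Quesst14

corpus = Quesst14("/path/to/quesst14Database")  # hypothetical root

# valid and test queries are separate, but share one document database
print(len(corpus.valid_queries), len(corpus.test_queries), len(corpus.docs))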

SNIPS#

class s3prl.dataio.corpus.SNIPS(dataset_root: str, train_speakers: List[str], valid_speakers: List[str], test_speakers: List[str])[source][source]#

Bases: Corpus

property all_data[source]#
property data_split_ids[source]#
property data_split[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
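
A minimal sketch based only on the constructor signature; the dataset path and the speaker names are placeholders, so substitute the speakers actually shipped with Audio SNIPS:

from s3prl.dataio.corpus import SNIPS

corpus = SNIPS(
    "/path/to/SNIPS",  # hypothetical root
    train_speakers=["speaker_a"],
    valid_speakers=["speaker_b"],
    test_speakers=["speaker_c"],
)
print(len(corpus.all_data))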

SpeechCommandsV1#

class s3prl.dataio.corpus.SpeechCommandsV1(gsc1: str, gsc1_test: str, n_jobs: int = 4)[source][source]#

Bases: Corpus

Parameters:

dataset_root (str) – should contain a ‘dev’ sub-folder for the training/validation set and a ‘test’ sub-folder for the testing set

static split_dataset(root_dir: Union[str, Path], max_uttr_per_class=134217727) Tuple[List[Tuple[str, str]], List[Tuple[str, str]]][source][source]#

Split the Speech Commands dataset into training and validation lists (the test set is parsed separately).

Parameters:
  • root_dir – Speech Commands dataset root directory

  • max_uttr_per_class – maximum number of utterances kept per class (a predefined value from the original paper)

Returns:

train_list: [(class_name, audio_path), …]

valid_list: as above

static parse_train_valid_data_list(data_list, train_dataset_root: Path)[source][source]#
static parse_test_data_list(test_dataset_root: Path)[source][source]#
static path_to_unique_name(path: str)[source][source]#
classmethod list_to_dict(data_list)[source][source]#
property all_data[source]#

Return a Container keyed by id (str); each entry contains

    wav_path (str)
    class_name (str)

property data_split_ids[source]#
classmethod download_dataset(tgt_dir: str) None[source][source]#
property data_split[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
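
A minimal sketch based on the constructor signature and the all_data format above; both roots are hypothetical (the train/valid archive and the separate test archive):

from s3prl.dataio.corpus import SpeechCommandsV1

corpus = SpeechCommandsV1(
    "/path/to/speech_commands_v0.01",           # hypothetical train/valid root
    "/path/to/speech_commands_test_set_v0.01",  # hypothetical test root
)

utt_id, point = next(iter(corpus.all_data.items()))
print(utt_id, point["wav_path"], point["class_name"])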

VoxCeleb1SID#

class s3prl.dataio.corpus.VoxCeleb1SID(dataset_root: str, n_jobs: int = 4, cache_root: str = PosixPath('~/.cache/s3prl'))[source][source]#

Bases: Corpus

property all_data[source]#
property data_split_ids[source]#
classmethod download_dataset(target_dir: str, splits: List[str] = ['dev', 'test']) None[source][source]#
property data_split[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
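
A minimal sketch based only on the constructor signature; the dataset path is hypothetical and cache_root is left at its default:

from s3prl.dataio.corpus import VoxCeleb1SID

corpus = VoxCeleb1SID("/path/to/VoxCeleb1")  # hypothetical root
print(len(corpus.all_data))  # number of parsed utterances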

VoxCeleb1SV#

class s3prl.dataio.corpus.VoxCeleb1SV(dataset_root: str, download_dir: str, force_download: bool = True)[source][source]#

Bases: Corpus

classmethod path2uid(path)[source][source]#
classmethod path2data(paths, speakerid2label)[source][source]#
static format_path(dataset_root, download_dir, force_download: bool)[source][source]#
classmethod format_test_trials(download_dir: str, force_download: bool)[source][source]#
property all_data[source]#
property data_split_ids[source]#
property data_split[source]#
static dataframe_to_datapoints(df: DataFrame, unique_name_fn: callable)[source]#
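
A minimal sketch based only on the constructor signature; both paths are hypothetical, and download_dir is assumed to hold the verification metadata (e.g. the test trials) prepared by format_test_trials:

from s3prl.dataio.corpus import VoxCeleb1SV

corpus = VoxCeleb1SV(
    "/path/to/VoxCeleb1",     # hypothetical dataset root
    "/path/to/download_dir",  # hypothetical download/cache directory
)
print(len(corpus.all_data))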