corpus#
(s3prl.dataio.corpus)
Parse commonly used corpora into a standardized dictionary structure
| Class | Description |
| --- | --- |
| FluentSpeechCommands | Parse the Fluent Speech Command corpus |
| IEMOCAP | Parse the IEMOCAP corpus |
| LibriLight | Parse the LibriLight corpus |
| LibriSpeech | Parse the LibriSpeech corpus |
| Quesst14 | Parse the QUESST14 corpus |
| SNIPS | Parse the Audio SNIPS corpus |
| SpeechCommandsV1 | Parse the Google Speech Commands V1 corpus |
| VoxCeleb1SID | Parse the VoxCeleb1 corpus for classification |
| VoxCeleb1SV | Parse the VoxCeleb1 corpus for verification |
FluentSpeechCommands#
- class s3prl.dataio.corpus.FluentSpeechCommands(dataset_root: str, n_jobs: int = 4)[source]#
Bases: Corpus
Parse the Fluent Speech Command dataset
- Parameters:
dataset_root – (str) The dataset root of Fluent Speech Command
- property all_data[source]#
Return all the data points in a dict of the following format:

    data_id1:
        path: (str) The waveform path
        speakerId: (str) The speaker name
        transcription: (str) The transcription
        action: (str) The action
        object: (str) The action's targeting object
        location: (str) The location where the action happens
    data_id2:
        ...
- property data_split[source]#
Return a list: train_data, valid_data, test_data. Each is a dict following the format specified in all_data.
- property data_split_ids[source]#
Return a list: train_ids, valid_ids, test_ids. Each is a list containing data_ids, which can be used as keys to access all_data.
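A minimal usage sketch tying the three properties above together; the dataset path is a placeholder for your local copy of the corpus:

```python
from s3prl.dataio.corpus import FluentSpeechCommands

# "/path/to/fluent_speech_commands" is a placeholder for your local dataset root
corpus = FluentSpeechCommands("/path/to/fluent_speech_commands")

# the three splits follow the all_data format documented above
train_data, valid_data, test_data = corpus.data_split
train_ids, valid_ids, test_ids = corpus.data_split_ids

# data_ids index into all_data
for data_id in train_ids[:3]:
    point = corpus.all_data[data_id]
    print(data_id, point["path"], point["action"], point["location"])
```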
IEMOCAP#
- class s3prl.dataio.corpus.IEMOCAP(dataset_root: str, n_jobs: int = 4)[source]#
Bases: Corpus
Parse the IEMOCAP dataset
- Parameters:
dataset_root – (str) The dataset root of IEMOCAP
- property all_data[source]#
Return all the data points of IEMOCAP in a dict of the following format:

    data_id1:
        wav_path: (str) The waveform path
        speaker: (str) The speaker name
        act: (str) improvised / scripted
        emotion: (str) The emotion label
        session_id: (int) The session
    data_id2:
        ...
- get_whole_session(session_id: int)[source]#
- Parameters:
session_id (int) – The session index, selected from 1, 2, 3, 4, 5
- Returns:
dict – data points in a single session (containing both improvised and scripted recordings) in the same format as all_data
- get_session_with_act(session_id: int, act: str)[source]#
- Parameters:
session_id (int) – The session index, selected from 1, 2, 3, 4, 5
act (str) – ‘improvised’ or ‘scripted’
- Returns:
s3prl.base.container.Container – data points in a single session with a specific act (either improvised or scripted) in the same format as all_data
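A minimal usage sketch of the session accessors above; the dataset path is a placeholder, and the example assumes the returned Container can be iterated like a dict:

```python
from s3prl.dataio.corpus import IEMOCAP

# "/path/to/IEMOCAP" is a placeholder for your local dataset root
corpus = IEMOCAP("/path/to/IEMOCAP")

# all recordings of session 1, both improvised and scripted
session_1 = corpus.get_whole_session(1)

# only the improvised recordings of session 1
improvised = corpus.get_session_with_act(1, "improvised")
for data_id, point in improvised.items():
    print(data_id, point["wav_path"], point["emotion"])
```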
LibriSpeech#
- class s3prl.dataio.corpus.LibriSpeech(dataset_root: str, n_jobs: int = 4, train_split: List[str] = ['train-clean-100'], valid_split: List[str] = ['dev-clean'], test_split: List[str] = ['test-clean'])[source]#
Bases: Corpus
LibriSpeech Corpus Link: https://www.openslr.org/12
- Parameters:
dataset_root (str) – Path to LibriSpeech corpus directory.
n_jobs (int, optional) – Number of jobs. Defaults to 4.
train_split (List[str], optional) – Training splits. Defaults to ["train-clean-100"].
valid_split (List[str], optional) – Validation splits. Defaults to ["dev-clean"].
test_split (List[str], optional) – Testing splits. Defaults to ["test-clean"].
- property all_data[source]#
Return all the data points in a dict of the following format:

    data_id1:
        wav_path: (str) The waveform path
        transcription: (str) The transcription
        speaker: (str) The speaker name
        gender: (str) The speaker's gender
        corpus_split: (str) The split of corpus this sample belongs to
    data_id2:
        ...
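A minimal usage sketch with the default splits; the dataset path is a placeholder:

```python
from s3prl.dataio.corpus import LibriSpeech

# "/path/to/LibriSpeech" is a placeholder; split names follow the defaults above
corpus = LibriSpeech(
    "/path/to/LibriSpeech",
    train_split=["train-clean-100"],
    valid_split=["dev-clean"],
    test_split=["test-clean"],
)

# each point follows the all_data format documented above
for data_id, point in corpus.all_data.items():
    print(data_id, point["speaker"], point["corpus_split"], point["transcription"])
    break  # inspect a single sample
```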
LibriLight#
Quesst14#
SNIPS#
SpeechCommandsV1#
- class s3prl.dataio.corpus.SpeechCommandsV1(gsc1: str, gsc1_test: str, n_jobs: int = 4)[source]#
Bases: Corpus
- Parameters:
gsc1 (str) – Root of the training/validation set (the ‘dev’ portion of the corpus)
gsc1_test (str) – Root of the testing set (the ‘test’ portion of the corpus)
- static split_dataset(root_dir: Union[str, Path], max_uttr_per_class=134217727) → Tuple[List[Tuple[str, str]], List[Tuple[str, str]]][source]#
Split the Speech Commands dataset into train and valid lists.
- Parameters:
root_dir – Speech Commands dataset root dir
max_uttr_per_class – Maximum number of utterances per class; a predefined value in the original paper
- Returns:
train_list – [(class_name, audio_path), …]
valid_list – same format as train_list
- Return type:
Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]
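A minimal sketch of the static splitter above; the path is a placeholder pointing at an extracted Speech Commands V1 archive:

```python
from s3prl.dataio.corpus import SpeechCommandsV1

# "/path/to/speech_commands_v0.01" is a placeholder for the extracted archive
train_list, valid_list = SpeechCommandsV1.split_dataset(
    "/path/to/speech_commands_v0.01"
)

# each entry is a (class_name, audio_path) tuple
class_name, audio_path = train_list[0]
print(class_name, audio_path)
```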