dataset#

(s3prl.dataio.dataset)

Dataset#

class s3prl.dataio.dataset.Dataset(*args, **kwds)[source][source]#

Bases: Dataset

getinfo(index: int)[source][source]#

DiarizationDataset#

class s3prl.dataio.dataset.DiarizationDataset(mode, data_dir, chunk_size=2000, frame_shift=256, subsampling=1, rate=16000, use_last_samples=True, label_delay=0, num_speakers=None)[source][source]#

Bases: Dataset

getinfo(i)[source][source]#

EncodeCategories#

class s3prl.dataio.dataset.EncodeCategories(labels: List[List[str]], encoders: CategoryEncoders)[source][source]#

Bases: Dataset

getinfo(index: int)[source]#

EncodeCategory#

class s3prl.dataio.dataset.EncodeCategory(labels: List[str], encoder: CategoryEncoder)[source][source]#

Bases: Dataset

getinfo(index: int)[source]#

EncodeMultiLabel#

class s3prl.dataio.dataset.EncodeMultiLabel(labels: List[List[str]], encoder: CategoryEncoder)[source][source]#

Bases: Dataset

static label_to_binary_vector(label_ids: List[int], num_labels: int) → Tensor[source][source]#
getinfo(index: int)[source]#

EncodeText#

class s3prl.dataio.dataset.EncodeText(text: List[str], tokenizer: Tokenizer, iob: Optional[List[str]] = None)[source][source]#

Bases: Dataset

getinfo(index: int)[source]#

FrameLabelDataset#

class s3prl.dataio.dataset.FrameLabelDataset(df: DataFrame, num_class: int, frame_shift: int, chunk_secs: float, step_secs: float, use_unfull_chunks: bool = True, load_audio_conf: Optional[dict] = None, sample_rate: int = 16000)[source][source]#

Bases: Dataset

Parameters:

df (pd.DataFrame) – the DataFrame should have the following columns: record_id (str), wav_path (str), duration (float), utt_id (str), label (int), start_sec (float), end_sec (float)

getinfo(index: int)[source][source]#

LoadAudio#

class s3prl.dataio.dataset.LoadAudio(filepaths: List[str], start_secs: Optional[List[float]] = None, end_secs: Optional[List[float]] = None, sox_effects: Optional[Tuple[Tuple[str]]] = None, individual_sox_effects: Optional[List[Tuple[Tuple[str]]]] = None, max_secs: Optional[float] = None, generator: Optional[Random] = None, sample_rate: int = 16000)[source][source]#

Bases: Dataset

Parameters:
  • start_secs – use None to load from the start

  • end_secs – use None to load to the end

getinfo(index: int)[source]#

get_info#

s3prl.dataio.dataset.get_info(dataset, names: List[str], cache_dir: Optional[str] = None, n_jobs: int = 6)[source][source]#