tokenizer#

(s3prl.dataio.encoder.tokenizer)

Load tokenizers to encode & decode text.

Modified from tensorflow_datasets.features.text.*. Reference: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text_lib

Authors:
  • Heng-Jui Chang 2022

CharacterTokenizer#

class s3prl.dataio.encoder.tokenizer.CharacterTokenizer(vocab_list: Optional[List[str]] = None)[source]#

Bases: Tokenizer

Character tokenizer.

encode(s: str) → List[int][source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
classmethod load_from_file(vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None)[source]#
property vocab_size: int[source]#
property token_type: str[source]#
vocab_to_idx(vocab)[source]#
idx_to_vocab(idx)[source]#
property eos_idx: int[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#

CharacterSlotTokenizer#

class s3prl.dataio.encoder.tokenizer.CharacterSlotTokenizer(vocab_list: List[str], slots: List[str])[source]#

Bases: Tokenizer

Character tokenizer with slots.

encode(sent: str, iobs: str) → List[int][source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
classmethod load_from_file(vocab_file: str, slots_file: str)[source]#
property vocab_size: int[source]#
property token_type: str[source]#
vocab_to_idx(vocab)[source]#
idx_to_vocab(idx)[source]#
property eos_idx: int[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#

SubwordTokenizer#

class s3prl.dataio.encoder.tokenizer.SubwordTokenizer(spm)[source]#

Bases: Tokenizer

Subword tokenizer using sentencepiece.

encode(s: str) → List[int][source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
classmethod load_from_file(filepath: str)[source]#
property vocab_size: int[source]#
property token_type: str[source]#
property eos_idx: int[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#

SubwordSlotTokenizer#

class s3prl.dataio.encoder.tokenizer.SubwordSlotTokenizer(spm, slots)[source]#

Bases: Tokenizer

Subword tokenizer with slots.

encode(sent: str, iobs: str) → List[int][source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
classmethod load_from_file(filepath: str, slots_file: str)[source]#
property vocab_size: int[source]#
property token_type: str[source]#
property eos_idx: int[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#

WordTokenizer#

class s3prl.dataio.encoder.tokenizer.WordTokenizer(vocab_list: Optional[List[str]] = None)[source]#

Bases: CharacterTokenizer

Word tokenizer.

encode(s: str) → List[int][source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
property token_type: str[source]#
property eos_idx: int[source]#
idx_to_vocab(idx)[source]#
classmethod load_from_file(vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None)[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#
property vocab_size: int[source]#
vocab_to_idx(vocab)[source]#

PhonemeTokenizer#

class s3prl.dataio.encoder.tokenizer.PhonemeTokenizer(vocab_list: Optional[List[str]] = None)[source]#

Bases: WordTokenizer

Phoneme tokenizer.

property token_type: str[source]#
decode(idxs: List[int], ignore_repeat: bool = False) → str[source]#
encode(s: str) → List[int][source]#
property eos_idx: int[source]#
idx_to_vocab(idx)[source]#
classmethod load_from_file(vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None)[source]#
property pad_idx: int[source]#
property unk_idx: int[source]#
property vocab_size: int[source]#
vocab_to_idx(vocab)[source]#

load_tokenizer#

s3prl.dataio.encoder.tokenizer.load_tokenizer(mode: str, vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None, slots_file: Optional[str] = None) → Tokenizer[source]#

Load a text tokenizer.

Parameters:
  • mode (str) – Mode (“character”, “character-slot”, “subword”, “subword-slot”, “word”, “bert-…”)

  • vocab_file (str, optional) – Path to vocabularies. Defaults to None.

  • vocab_list (List[str], optional) – List of vocabularies. Defaults to None.

  • slots_file (str, optional) – Path to slots. Defaults to None.

Raises:

NotImplementedError – If mode is not implemented.

Returns:

Text tokenizer.

Return type:

Tokenizer

default_phoneme_tokenizer#

s3prl.dataio.encoder.tokenizer.default_phoneme_tokenizer() → PhonemeTokenizer[source]#

Returns a default LibriSpeech phoneme tokenizer.

Returns:

A tokenizer whose vocabulary includes 71 phonemes.

Return type:

PhonemeTokenizer