encoder

(s3prl.dataio.encoder)

Encode the raw data into numeric format, and then decode it.

  • s3prl.dataio.encoder.category – Simple categorical encoder

  • s3prl.dataio.encoder.g2p – Basic G2P

  • s3prl.dataio.encoder.tokenizer – Load tokenizer to encode & decode

  • s3prl.dataio.encoder.vocabulary – Create vocabulary (train tokenizer)

CategoryEncoder

class s3prl.dataio.encoder.CategoryEncoder(category: List[str])

Bases: object

encode(label: str) → int
decode(index: int) → str
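
A minimal usage sketch based on the signatures above; the label set is illustrative:

from s3prl.dataio.encoder import CategoryEncoder

# Hypothetical label set; any list of unique strings works.
encoder = CategoryEncoder(["cat", "dog", "bird"])

index = encoder.encode("dog")   # integer id assigned to "dog"
label = encoder.decode(index)   # expected to round-trip back to "dog"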

CategoryEncoders

class s3prl.dataio.encoder.CategoryEncoders(categories: List[List[str]])

Bases: object

encode(labels: List[str]) → List[int]
decode(indices: List[int]) → List[str]
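
A sketch of the multi-category variant, which encodes one label per category; both category lists here are made up:

from s3prl.dataio.encoder import CategoryEncoders

encoders = CategoryEncoders([
    ["female", "male"],           # e.g. speaker gender
    ["happy", "sad", "neutral"],  # e.g. emotion
])

indices = encoders.encode(["male", "sad"])  # one index per category
labels = encoders.decode(indices)           # expected: ["male", "sad"]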

G2P

class s3prl.dataio.encoder.G2P(file_list: Optional[List[str]] = None, allow_unk: bool = False)

Bases: object

Grapheme-to-phoneme converter.

Parameters:
  • file_list (List[str], optional) – List of lexicon files. Defaults to None.

  • allow_unk (bool) – If False, raise an error when a word cannot be recognized by this basic G2P. Defaults to False.

encode(text: str) → str

Converts grapheme-based sentences to phonemes.

Parameters:
  • text (str) – Sentence.

Returns:
  Phonemized sentence

Return type:
  str
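
A usage sketch, assuming the default lexicon shipped with this basic G2P covers the words below; with allow_unk=False (the default), an out-of-lexicon word raises an error instead:

from s3prl.dataio.encoder import G2P

g2p = G2P()  # file_list=None falls back to the bundled lexicon files
phonemes = g2p.encode("hello world")  # e.g. "HH AH L OW W ER L D" (illustrative)
print(phonemes)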

Tokenizer

class s3prl.dataio.encoder.Tokenizer

Bases: object

abstract encode(text: str, iob: Optional[str] = None) → List[int]
abstract decode(idxs: List[int], ignore_repeat: bool = False) → str
abstract property vocab_size: int
abstract property token_type: str
abstract classmethod load_from_file(vocab_file: str)
property pad_idx: int
property eos_idx: int
property unk_idx: int
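
Tokenizer is an abstract base class: a concrete tokenizer must provide encode, decode, vocab_size, token_type, and load_from_file. A toy subclass, only to illustrate the contract (assuming the base class takes no required constructor arguments):

from typing import List, Optional

from s3prl.dataio.encoder import Tokenizer

class UppercaseTokenizer(Tokenizer):
    """Toy tokenizer mapping A-Z to ids 0-25; for illustration only."""

    def encode(self, text: str, iob: Optional[str] = None) -> List[int]:
        return [ord(c) - ord("A") for c in text.upper() if "A" <= c <= "Z"]

    def decode(self, idxs: List[int], ignore_repeat: bool = False) -> str:
        return "".join(chr(i + ord("A")) for i in idxs)

    @property
    def vocab_size(self) -> int:
        return 26

    @property
    def token_type(self) -> str:
        return "character"

    @classmethod
    def load_from_file(cls, vocab_file: str):
        return cls()  # this toy class needs no vocabulary file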

BertTokenizer

class s3prl.dataio.encoder.BertTokenizer(tokenizer)

Bases: Tokenizer

Bert Tokenizer.

https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_bert.py

encode(s: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
classmethod load_from_file(vocab_file: str)
property vocab_size: int
property token_type: str
property eos_idx: int
property pad_idx: int
property unk_idx: int
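
A sketch that wraps a Hugging Face BERT tokenizer, which the linked source suggests is the expected tokenizer argument; the model name is illustrative:

from transformers import BertTokenizer as HFBertTokenizer

from s3prl.dataio.encoder import BertTokenizer

hf_tokenizer = HFBertTokenizer.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer(hf_tokenizer)

ids = tokenizer.encode("hello world")
text = tokenizer.decode(ids)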

WordTokenizer

class s3prl.dataio.encoder.WordTokenizer(vocab_list: Optional[List[str]] = None)

Bases: CharacterTokenizer

Word tokenizer.

encode(s: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
property token_type: str
property eos_idx: int
idx_to_vocab(idx)
classmethod load_from_file(vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None)
property pad_idx: int
property unk_idx: int
property vocab_size: int
vocab_to_idx(vocab)
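
A sketch with a hand-made word list; real vocabularies would typically come from generate_vocab or load_from_file:

from s3prl.dataio.encoder import WordTokenizer

tokenizer = WordTokenizer(vocab_list=["hello", "world"])

ids = tokenizer.encode("hello world")  # one id per word
print(tokenizer.decode(ids))           # expected: "hello world"
print(tokenizer.vocab_size)            # word list plus any special tokens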

CharacterTokenizer

class s3prl.dataio.encoder.CharacterTokenizer(vocab_list: Optional[List[str]] = None)

Bases: Tokenizer

Character tokenizer.

encode(s: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
classmethod load_from_file(vocab_file: Optional[str] = None, vocab_list: Optional[List[str]] = None)
property vocab_size: int
property token_type: str
vocab_to_idx(vocab)
idx_to_vocab(idx)
property eos_idx: int
property pad_idx: int
property unk_idx: int
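
A sketch with an inline character list; vocab_to_idx and idx_to_vocab expose the underlying mapping:

from s3prl.dataio.encoder import CharacterTokenizer

tokenizer = CharacterTokenizer(vocab_list=list("abc "))

ids = tokenizer.encode("a cab")
print(tokenizer.decode(ids))           # expected round trip: "a cab"
print(tokenizer.vocab_to_idx("a"))     # id of the character "a"
print(tokenizer.idx_to_vocab(ids[0]))  # back to "a"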

CharacterSlotTokenizer

class s3prl.dataio.encoder.CharacterSlotTokenizer(vocab_list: List[str], slots: List[str])

Bases: Tokenizer

Character tokenizer with slots.

encode(sent: str, iobs: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
classmethod load_from_file(vocab_file: str, slots_file: str)
property vocab_size: int
property token_type: str
vocab_to_idx(vocab)
idx_to_vocab(idx)
property eos_idx: int
property pad_idx: int
property unk_idx: int
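
A slot-filling sketch; encode takes the sentence together with its word-level IOB tag string. The slot names and the exact IOB string format here are assumptions:

from s3prl.dataio.encoder import CharacterSlotTokenizer

tokenizer = CharacterSlotTokenizer(
    vocab_list=list("abcdefghijklmnopqrstuvwxyz "),
    slots=["O", "B-toloc", "I-toloc"],  # hypothetical slot inventory
)

# One IOB tag per word, aligned with the sentence (assumed format).
ids = tokenizer.encode("fly to boston", "O O B-toloc")
text = tokenizer.decode(ids)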

SubwordTokenizer

class s3prl.dataio.encoder.SubwordTokenizer(spm)

Bases: Tokenizer

Subword tokenizer using sentencepiece.

encode(s: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
classmethod load_from_file(filepath: str)
property vocab_size: int
property token_type: str
property eos_idx: int
property pad_idx: int
property unk_idx: int
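
A sketch loading a trained sentencepiece model; the path is a placeholder and would typically come from generate_subword_vocab:

from s3prl.dataio.encoder import SubwordTokenizer

# "subword.model" is a placeholder for a trained sentencepiece model file.
tokenizer = SubwordTokenizer.load_from_file("subword.model")

ids = tokenizer.encode("hello world")
print(tokenizer.decode(ids))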

SubwordSlotTokenizer

class s3prl.dataio.encoder.SubwordSlotTokenizer(spm, slots)

Bases: Tokenizer

Subword tokenizer with slots.

encode(sent: str, iobs: str) → List[int]
decode(idxs: List[int], ignore_repeat: bool = False) → str
classmethod load_from_file(filepath: str, slots_file: str)
property vocab_size: int
property token_type: str
property eos_idx: int
property pad_idx: int
property unk_idx: int
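
The slot-aware variant loads a sentencepiece model plus a slots file; both paths are placeholders, and the IOB format follows the same assumption as above:

from s3prl.dataio.encoder import SubwordSlotTokenizer

tokenizer = SubwordSlotTokenizer.load_from_file("subword.model", "slots.txt")

# Sentence plus word-level IOB tags, as with CharacterSlotTokenizer.
ids = tokenizer.encode("fly to boston", "O O B-toloc")
text = tokenizer.decode(ids)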

generate_basic_vocab

s3prl.dataio.encoder.generate_basic_vocab(mode: str, text_list: List[str], vocab_size: int = -1, coverage: float = 1.0, sort_vocab: bool = True) → List[str]

Generates basic vocabularies, including character- and word-based vocabularies.

Parameters:
  • mode (str) – Vocabulary type ("character" or "word").

  • text_list (List[str]) – List of text data.

  • vocab_size (int, optional) – Vocabulary size. If not specified (-1), the size is coverage * the actual vocabulary size. Defaults to -1.

  • coverage (float, optional) – Vocabulary coverage. Defaults to 1.0.

  • sort_vocab (bool, optional) – Sort vocabularies alphabetically. Defaults to True.

Returns:
  A list of vocabularies

Return type:
  List[str]
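
A sketch using the documented signature; the corpus is a toy in-memory list:

from s3prl.dataio.encoder import generate_basic_vocab

corpus = ["hello world", "hello again"]

chars = generate_basic_vocab(mode="character", text_list=corpus)
words = generate_basic_vocab(mode="word", text_list=corpus)
# e.g. words == ["again", "hello", "world"] with sort_vocab=True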

generate_subword_vocab

s3prl.dataio.encoder.generate_subword_vocab(text_list: Optional[List[str]] = None, text_file: Optional[str] = None, output_file: Optional[str] = None, vocab_size: int = 1000, character_coverage: float = 1.0) → str

Generates subword vocabularies based on sentencepiece.

Parameters:
  • text_list (List[str], optional) – List of text data. Defaults to None.

  • text_file (str, optional) – Path to text data. Defaults to None.

  • output_file (str, optional) – Path prefix for saving the trained subword vocabularies. Defaults to None.

  • vocab_size (int, optional) – Vocabulary size. Defaults to 1000.

  • character_coverage (float, optional) – Coverage of characters in text data. Defaults to 1.0.

Raises:
  ImportError – If sentencepiece is not installed.

Returns:
  Path to ${output_file}.model

Return type:
  str
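
A sketch, assuming sentencepiece is installed; the text file path and output prefix are placeholders, and vocab_size must be reachable for the corpus (sentencepiece errors out otherwise):

from s3prl.dataio.encoder import generate_subword_vocab

model_path = generate_subword_vocab(
    text_file="corpus.txt",  # placeholder: one sentence per line
    output_file="subword",   # placeholder output prefix
    vocab_size=500,          # must not exceed what the corpus supports
)
print(model_path)            # expected: "subword.model"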

generate_vocab

s3prl.dataio.encoder.generate_vocab(mode: str, text_list: Optional[List[str]] = None, text_file: Optional[str] = None, read_lines: int = 10000000, **vocab_args) → Union[List[str], str]

Generates vocabularies given text data.

Parameters:
  • mode (str) – Vocabulary type ("character", "word", or "subword").

  • text_list (List[str], optional) – List of text data. Defaults to None.

  • text_file (str, optional) – Path to text data. Defaults to None.

  • read_lines (int, optional) – Maximum number of lines to read from text_file. Defaults to 10000000.

  • vocab_args – Keyword arguments forwarded to generate_subword_vocab if mode == "subword", otherwise to generate_basic_vocab.

Returns:
  A list of vocabularies, or a path to the .vocab file

Return type:
  Union[List[str], str]
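
A sketch of the dispatcher: extra keyword arguments flow to the mode-specific generator. Paths are placeholders:

from s3prl.dataio.encoder import generate_vocab

# Basic modes return a list of vocabulary entries.
words = generate_vocab(mode="word", text_list=["hello world"])

# Subword mode forwards to generate_subword_vocab and returns a model path.
model_path = generate_vocab(
    mode="subword",
    text_file="corpus.txt",  # placeholder corpus
    output_file="subword",   # placeholder prefix
    vocab_size=500,
)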