Source code for s3prl.dataio.encoder.g2p

"""
Basic G2P

Authors:
  * Heng-Jui Chang 2022
"""

import logging
from collections import defaultdict
from typing import Dict, List, Tuple

from s3prl.util.download import _urls_to_filepaths

DEFAULT_LEXICON_URL = [
    "https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-200k-g2p.txt",
    "https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-allothers-g2p.txt",
]

__all__ = ["G2P"]


def parse_lexicon(line: str) -> Tuple[str, List[str]]:
    line.replace("\t", " ")
    word, *phonemes = line.split()
    return word, phonemes


def read_lexicon_files(file_list: List[str]) -> Dict[str, List[str]]:
    w2p_dict = defaultdict(list)
    for file in file_list:
        with open(file, "r") as fp:
            lines = [line.strip() for line in fp]
            for line in lines:
                word, phonemes = parse_lexicon(line)
                w2p_dict[word].append(phonemes)

    w2p = {}
    for word, phonemes_all in w2p_dict.items():
        if len(phonemes_all) > 1:
            logging.info(f"{len(phonemes_all)} phoneme sequences found for {word}.")
            for i, phonemes in enumerate(phonemes_all):
                logging.info(f"{i}. {phonemes}")
        w2p[word] = phonemes_all[0]
    logging.info("Taking the first phoneme sequences for a deterministic behavior.")

    return w2p


[docs]class G2P: """Grapheme-to-phoneme Args: file_list (List[str], optional): List of lexicon files. Defaults to None. allow_unk (bool): If false, raise Error when a word can not be recognized by this basic G2P """ def __init__(self, file_list: List[str] = None, allow_unk: bool = False): self.allow_unk = allow_unk if file_list is None: file_list = _urls_to_filepaths(*DEFAULT_LEXICON_URL) self.word2phone = read_lexicon_files(file_list)
[docs] def encode(self, text: str) -> str: """Converts grapheme-based sentences to phonemes Args: text (str): Sentence Returns: str: Phonemized sentence """ word_list = text.strip().upper().split(" ") phonemes = [] for word in word_list: if not self.allow_unk: assert word in self.word2phone phonemes += self.word2phone.get(word, ["<UNK>"]) return " ".join(phonemes)