# coding: utf-8
"""Japanese text frontend: normalizes text and converts it to/from a sequence
of Unicode code points, optionally mixing in katakana readings from MeCab."""
import MeCab
import jaconv
from random import random

# Characters are encoded with ord(), so the vocabulary covers the Basic
# Multilingual Plane.
n_vocab = 0xffff

_eos = 1
_pad = 0
_tagger = None  # MeCab tagger, created lazily on first use


def _yomi(mecab_result):
    """Split MeCab output into surface tokens and their katakana readings.

    Returns two parallel lists; a reading is None when MeCab does not
    provide one (unknown words are marked with "*").
    """
    tokens = []
    yomis = []
    for line in mecab_result.split("\n")[:-1]:
        s = line.split("\t")
        if len(s) == 1:
            # "EOS" line: end of the analysis
            break
        token, rest = s
        rest = rest.split(",")
        tokens.append(token)
        # The reading (yomi) is the 8th feature field when available
        yomi = rest[7] if len(rest) > 7 else None
        yomi = None if yomi == "*" else yomi
        yomis.append(yomi)

    return tokens, yomis


def _mix_pronunciation(tokens, yomis, p):
    # With probability p, replace a token by its katakana reading (if known)
    return "".join(
        yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
        for idx in range(len(tokens)))


def mix_pronunciation(text, p):
    global _tagger
    if _tagger is None:
        _tagger = MeCab.Tagger("")
    tokens, yomis = _yomi(_tagger.parse(text))
    return _mix_pronunciation(tokens, yomis, p)


def add_punctuation(text):
    # Append "。" if the text does not already end with punctuation
    last = text[-1]
    if last not in [".", ",", "、", "。", "!", "?", "!", "?"]:
        text = text + "。"
    return text


def normalize_delimitor(text):
    # Normalize ASCII and full-width delimiters to "、" and "。"
    text = text.replace(",", "、")
    text = text.replace(".", "。")
    text = text.replace(",", "、")
    text = text.replace(".", "。")
    return text


def text_to_sequence(text, p=0.0):
    # Strip whitespace (ASCII and full-width) and bracket characters
    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】",
              "(", ")", "(", ")"]:
        text = text.replace(c, "")
    # Normalize full-width exclamation/question marks to ASCII
    text = text.replace("!", "!")
    text = text.replace("?", "?")

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    # Encode each character as its Unicode code point and append EOS
    return [ord(c) for c in text] + [_eos]  # EOS


def sequence_to_text(seq):
    return "".join(chr(n) for n in seq)
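

# A minimal usage sketch (illustrative addition, not part of the original
# module): converts one sentence to a code-point sequence and back. It assumes
# MeCab with a dictionary that provides readings, plus jaconv, are installed;
# the sample sentence is arbitrary.
if __name__ == "__main__":
    seq = text_to_sequence("こんにちは、今日はいい天気ですね。", p=0.5)
    print(seq)
    print(sequence_to_text(seq))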