# coding: utf-8
"""Japanese text frontend: normalizes text and converts it to/from a sequence
of Unicode code points, optionally mixing in katakana readings from MeCab."""
import MeCab
import jaconv
from random import random

# Characters are encoded with ord(), so the vocabulary covers the Basic
# Multilingual Plane.
n_vocab = 0xffff

_eos = 1
_pad = 0
_tagger = None  # MeCab tagger, created lazily on first use


def _yomi(mecab_result):
    """Split MeCab output into surface tokens and their katakana readings.

    Returns two parallel lists; a reading is None when MeCab does not
    provide one (unknown words are marked with "*").
    """
    tokens = []
    yomis = []
    for line in mecab_result.split("\n")[:-1]:
        s = line.split("\t")
        if len(s) == 1:
            # "EOS" line: end of the analysis
            break
        token, rest = s
        rest = rest.split(",")
        tokens.append(token)
        # The reading (yomi) is the 8th feature field when available
        yomi = rest[7] if len(rest) > 7 else None
        yomi = None if yomi == "*" else yomi
        yomis.append(yomi)

    return tokens, yomis


def _mix_pronunciation(tokens, yomis, p):
    # With probability p, replace a token by its katakana reading (if known)
    return "".join(
        yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
        for idx in range(len(tokens)))


def mix_pronunciation(text, p):
    global _tagger
    if _tagger is None:
        _tagger = MeCab.Tagger("")
    tokens, yomis = _yomi(_tagger.parse(text))
    return _mix_pronunciation(tokens, yomis, p)


def add_punctuation(text):
    # Append "。" if the text does not already end with punctuation
    last = text[-1]
    if last not in [".", ",", "、", "。", "!", "?", "!", "?"]:
        text = text + "。"
    return text


def normalize_delimitor(text):
    # Normalize ASCII and full-width delimiters to "、" and "。"
    text = text.replace(",", "、")
    text = text.replace(".", "。")
    text = text.replace(",", "、")
    text = text.replace(".", "。")
    return text


def text_to_sequence(text, p=0.0):
    # Strip whitespace (ASCII and full-width) and bracket characters
    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】",
              "(", ")", "(", ")"]:
        text = text.replace(c, "")
    # Normalize full-width exclamation/question marks to ASCII
    text = text.replace("!", "!")
    text = text.replace("?", "?")

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    # Encode each character as its Unicode code point and append EOS
    return [ord(c) for c in text] + [_eos]  # EOS


def sequence_to_text(seq):
    return "".join(chr(n) for n in seq)
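

# A minimal usage sketch (illustrative addition, not part of the original
# module): converts one sentence to a code-point sequence and back. It assumes
# MeCab with a dictionary that provides readings, plus jaconv, are installed;
# the sample sentence is arbitrary.
if __name__ == "__main__":
    seq = text_to_sequence("こんにちは、今日はいい天気ですね。", p=0.5)
    print(seq)
    print(sequence_to_text(seq))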