From a702995b26b5ad174c1eddde84ad8d7942712bb9 Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Wed, 20 Nov 2019 19:25:22 +0800
Subject: [PATCH] make grapheme to phoneme an independent part

---
 g2p/__init__.py      |  34 +++++++++++++
 g2p/en/__init__.py   |  52 ++++++++++++++++++++
 g2p/es/__init__.py   |  20 ++++++++
 g2p/jp/__init__.py   | 110 +++++++++++++++++++++++++++++++++++++++++++
 g2p/ko/__init__.py   |  24 ++++++++++
 g2p/text/__init__.py |  91 +++++++++++++++++++++++++++++++++++
 g2p/text/cleaners.py | 103 ++++++++++++++++++++++++++++++++++++++++
 g2p/text/cmudict.py  |  76 ++++++++++++++++++++++++++++++
 g2p/text/numbers.py  |  78 ++++++++++++++++++++++++++++++
 g2p/text/symbols.py  |  17 +++++++
 10 files changed, 605 insertions(+)
 create mode 100644 g2p/__init__.py
 create mode 100644 g2p/en/__init__.py
 create mode 100644 g2p/es/__init__.py
 create mode 100644 g2p/jp/__init__.py
 create mode 100644 g2p/ko/__init__.py
 create mode 100644 g2p/text/__init__.py
 create mode 100644 g2p/text/cleaners.py
 create mode 100644 g2p/text/cmudict.py
 create mode 100644 g2p/text/numbers.py
 create mode 100644 g2p/text/symbols.py

diff --git a/g2p/__init__.py b/g2p/__init__.py
new file mode 100644
index 0000000..2d513be
--- /dev/null
+++ b/g2p/__init__.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+
+"""Text processing frontend
+
+All frontend modules should have the following functions:
+
+- text_to_sequence(text, p)
+- sequence_to_text(sequence)
+
+and the property:
+
+- n_vocab
+
+"""
+from g2p import en
+
+# optional Japanese frontend
+try:
+    from g2p import jp
+except ImportError:
+    jp = None
+
+# optional Korean frontend
+try:
+    from g2p import ko
+except ImportError:
+    ko = None
+
+# optional Spanish frontend; to use it you need to modify _characters in text/symbols.py:
+# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ'
+try:
+    from g2p import es
+except ImportError:
+    es = None
diff --git a/g2p/en/__init__.py b/g2p/en/__init__.py
new file mode 100644
index 0000000..912f2ae
--- /dev/null
+++ b/g2p/en/__init__.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+"""English frontend: text cleaning plus optional ARPAbet substitution."""
+
+from g2p.text.symbols import symbols
+from g2p import text
+from g2p.text import sequence_to_text
+
+import nltk
+from random import random
+
+n_vocab = len(symbols)
+
+# Pronunciation table from NLTK's cmudict corpus reader.
+_arpabet = nltk.corpus.cmudict.dict()
+
+
+def _maybe_get_arpabet(word, p):
+    """Return ``word`` as a ``{...}`` ARPAbet group with probability ``p``.
+
+    Words that are not keys of ``_arpabet`` are returned unchanged; the first
+    listed pronunciation is the one used.
+    """
+    try:
+        phonemes = " ".join(_arpabet[word][0])
+    except KeyError:
+        return word
+
+    return '{%s}' % phonemes if random() < p else word
+
+
+def mix_pronunciation(text, p):
+    """Apply ``_maybe_get_arpabet`` to every space-separated word of ``text``."""
+    return ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
+
+
+def text_to_sequence(text, p=0.0):
+    """Convert English ``text`` to a list of symbol IDs.
+
+    Args:
+        text: string to convert.
+        p: probability of replacing a dictionary word by its phonemes.
+
+    Returns:
+        The list returned by ``g2p.text.text_to_sequence`` for the
+        "english_cleaners" pipeline.
+    """
+    # random() < p never holds for p <= 0, so the dictionary pass is skipped.
+    if p > 0:
+        text = mix_pronunciation(text, p)
+    # Imported locally under an alias because this function has the same name.
+    from g2p.text import text_to_sequence as _text_to_sequence
+    return _text_to_sequence(text, ["english_cleaners"])
diff --git a/g2p/es/__init__.py b/g2p/es/__init__.py
new file mode 100644
index 0000000..d090468
--- /dev/null
+++ b/g2p/es/__init__.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+"""Spanish frontend: basic cleaning only, no phoneme substitution."""
+from g2p.text.symbols import symbols
+from g2p.text import sequence_to_text
+
+import nltk
+from random import random
+
+n_vocab = len(symbols)
+
+
+def text_to_sequence(text, p=0.0):
+    """Convert ``text`` to a list of symbol IDs using "basic_cleaners".
+
+    ``p`` is accepted for interface parity with the other frontends and is
+    not used.
+    """
+    # Imported locally under an alias because this function has the same name.
+    from g2p.text import text_to_sequence as _text_to_sequence
+    return _text_to_sequence(text, ["basic_cleaners"])
diff --git a/g2p/jp/__init__.py b/g2p/jp/__init__.py
new file mode 100644
index 0000000..dcb0845
--- /dev/null
+++ b/g2p/jp/__init__.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+"""Japanese frontend: character-code sequences with optional MeCab readings."""
+
+import MeCab
+import jaconv
+from random import random
+
+n_vocab = 0xffff
+
+_eos = 1
+_pad = 0
+_tagger = None
+
+# Full-width punctuation is spelled with escapes so that Unicode normalization
+# of this source file cannot silently turn it into its ASCII counterpart.
+_FW_SPACE = "\u3000"
+_FW_LPAREN, _FW_RPAREN = "\uff08", "\uff09"
+_FW_EXCL, _FW_QUESTION = "\uff01", "\uff1f"
+_FW_COMMA, _FW_PERIOD = "\uff0c", "\uff0e"
+
+
+def _yomi(mecab_result):
+    """Split MeCab output into surface tokens and their readings.
+
+    Each line is read as a token, a tab, then comma-separated features; the
+    reading is feature index 7 and is ``None`` when missing or ``"*"``.
+    Parsing stops at the first line without a tab.
+    """
+    tokens = []
+    yomis = []
+    for line in mecab_result.split("\n")[:-1]:
+        s = line.split("\t")
+        if len(s) == 1:
+            break
+        token, rest = s
+        rest = rest.split(",")
+        tokens.append(token)
+        yomi = rest[7] if len(rest) > 7 else None
+        yomi = None if yomi == "*" else yomi
+        yomis.append(yomi)
+
+    return tokens, yomis
+
+
+def _mix_pronunciation(tokens, yomis, p):
+    """Join tokens, using the reading (when known) with probability ``p``."""
+    return "".join(
+        yomi if yomi is not None and random() < p else token
+        for token, yomi in zip(tokens, yomis))
+
+
+def mix_pronunciation(text, p):
+    """Replace tokens of ``text`` by their MeCab readings with probability ``p``."""
+    global _tagger
+    if _tagger is None:
+        # Created on first use and then reused.
+        _tagger = MeCab.Tagger("")
+    tokens, yomis = _yomi(_tagger.parse(text))
+    return _mix_pronunciation(tokens, yomis, p)
+
+
+def add_punctuation(text):
+    """Append "。" unless ``text`` already ends with punctuation.
+
+    Empty text is returned unchanged (indexing it would raise IndexError).
+    """
+    if not text:
+        return text
+    if text[-1] not in (".", ",", "、", "。", "!", "?", _FW_EXCL, _FW_QUESTION):
+        text = text + "。"
+    return text
+
+
+def normalize_delimitor(text):
+    """Map ASCII and full-width commas and periods to "、" and "。"."""
+    for src, dst in ((",", "、"), (".", "。"),
+                     (_FW_COMMA, "、"), (_FW_PERIOD, "。")):
+        text = text.replace(src, dst)
+    return text
+
+
+def text_to_sequence(text, p=0.0):
+    """Convert Japanese ``text`` to a list of code points followed by EOS.
+
+    Args:
+        text: string to convert.
+        p: probability of replacing a token by its reading; readings are
+            only looked up when ``p > 0``.
+    """
+    for c in [" ", _FW_SPACE, "「", "」", "『", "』", "・", "【", "】",
+              _FW_LPAREN, _FW_RPAREN, "(", ")"]:
+        text = text.replace(c, "")
+    # NOTE(review): replacing "!" by "!" was a no-op; the full-width form is
+    # assumed to be the intended target - confirm.
+    text = text.replace("!", _FW_EXCL)
+    text = text.replace("?", _FW_QUESTION)
+
+    text = normalize_delimitor(text)
+    text = jaconv.normalize(text)
+    if p > 0:
+        text = mix_pronunciation(text, p)
+    text = jaconv.hira2kata(text)
+    text = add_punctuation(text)
+
+    return [ord(c) for c in text] + [_eos]  # EOS
+
+
+def sequence_to_text(seq):
+    """Map each integer of ``seq`` back to a character with ``chr``."""
+    return "".join(chr(n) for n in seq)
diff --git a/g2p/ko/__init__.py b/g2p/ko/__init__.py
new file mode 100644
index 0000000..2a6465b
--- /dev/null
+++ b/g2p/ko/__init__.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+"""Korean frontend: raw character-code sequences, no text normalization."""
+
+from random import random
+
+n_vocab = 0xffff
+
+_eos = 1
+_pad = 0
+_tagger = None
+
+
+def text_to_sequence(text, p=0.0):
+    """Return the code points of ``text`` followed by the EOS ID.
+
+    ``p`` is accepted for interface parity with the other frontends and is
+    not used.
+    """
+    return [ord(c) for c in text] + [_eos]  # EOS
+
+
+def sequence_to_text(seq):
+    """Map each integer of ``seq`` back to a character with ``chr``."""
+    return "".join(chr(n) for n in seq)
diff --git a/g2p/text/__init__.py b/g2p/text/__init__.py
new file mode 100644
index 0000000..0be6d4a
--- /dev/null
+++ b/g2p/text/__init__.py
@@ -0,0 +1,91 @@
+import re
+from g2p.text import cleaners
+from g2p.text.symbols import symbols
+
+
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+# Regular expression matching text enclosed in curly braces:
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
+
+
+def text_to_sequence(text, cleaner_names):
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+
+    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+
+    Args:
+        text: string to convert to a sequence
+        cleaner_names: names of the cleaner functions to run the text through
+
+    Returns:
+        List of integers corresponding to the symbols in the text
+    '''
+    sequence = []
+
+    # Check for curly braces and treat their contents as ARPAbet.
+    # NOTE(review): cleaners run on each plain-text segment separately, so one
+    # that appends terminal punctuation (english_cleaners does) also appends it
+    # to the text in front of every "{...}" group - confirm this is intended.
+    while text:
+        m = _curly_re.match(text)
+        if not m:
+            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
+            break
+        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
+        sequence += _arpabet_to_sequence(m.group(2))
+        text = m.group(3)
+
+    # Append EOS token
+    sequence.append(_symbol_to_id['~'])
+    return sequence
+
+
+def sequence_to_text(sequence):
+    '''Converts a sequence of IDs back to a string; unknown IDs are skipped.'''
+    pieces = []
+    for symbol_id in sequence:
+        if symbol_id in _id_to_symbol:
+            s = _id_to_symbol[symbol_id]
+            # Enclose ARPAbet back in curly braces:
+            if len(s) > 1 and s[0] == '@':
+                s = '{%s}' % s[1:]
+            pieces.append(s)
+    # Adjacent ARPAbet symbols are merged into one space-separated group.
+    return ''.join(pieces).replace('}{', ' ')
+
+
+def _clean_text(text, cleaner_names):
+    '''Runs text through each named cleaner from the cleaners module, in order.
+
+    Raises:
+        Exception: if a name is not an attribute of the cleaners module.
+    '''
+    for name in cleaner_names:
+        # A default is needed here: without it getattr raises AttributeError
+        # and the "Unknown cleaner" check below can never fire.
+        cleaner = getattr(cleaners, name, None)
+        if not cleaner:
+            raise Exception('Unknown cleaner: %s' % name)
+        text = cleaner(text)
+    return text
+
+
+def _symbols_to_sequence(symbols):
+    '''Maps symbols to IDs, dropping unknown symbols, padding and EOS.'''
+    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
+
+
+def _arpabet_to_sequence(text):
+    '''Maps whitespace-separated ARPAbet symbols to IDs via their "@" form.'''
+    return _symbols_to_sequence(['@' + s for s in text.split()])
+
+
+def _should_keep_symbol(s):
+    '''True for known symbols other than padding ("_") and EOS ("~").'''
+    # Compared with != rather than "is not": identity of equal strings is an
+    # interpreter detail and "is not" with a literal is a SyntaxWarning.
+    return s in _symbol_to_id and s != '_' and s != '~'
diff --git a/g2p/text/cleaners.py b/g2p/text/cleaners.py
new file mode 100644
index 0000000..86bf9f3
--- /dev/null
+++ b/g2p/text/cleaners.py
@@ -0,0 +1,103 @@
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+'''
+
+import re
+from unidecode import unidecode
+from .numbers import normalize_numbers
+
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+
+def expand_abbreviations(text):
+    '''Replaces each known abbreviation followed by a period with its expansion.'''
+    for regex, replacement in _abbreviations:
+        text = regex.sub(replacement, text)
+    return text
+
+
+def expand_numbers(text):
+    '''Rewrites numbers by delegating to normalize_numbers.'''
+    return normalize_numbers(text)
+
+
+def lowercase(text):
+    '''Returns text lowercased.'''
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    '''Replaces every run of whitespace with a single space.'''
+    return _whitespace_re.sub(' ', text)
+
+
+def convert_to_ascii(text):
+    '''Transliterates text with unidecode.'''
+    return unidecode(text)
+
+
+def add_punctuation(text):
+    '''Appends "." to non-empty text that does not end with one of "!,.:;?".'''
+    if not text:
+        return text
+    if text[-1] not in '!,.:;?':
+        text = text + '.'  # without this decoder is confused when to output EOS
+    return text
+
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def english_cleaners(text):
+    '''Pipeline for English text, including number and abbreviation expansion.'''
+    text = convert_to_ascii(text)
+    text = add_punctuation(text)
+    text = lowercase(text)
+    text = expand_numbers(text)
+    text = expand_abbreviations(text)
+    text = collapse_whitespace(text)
+    return text
diff --git a/g2p/text/cmudict.py b/g2p/text/cmudict.py
new file mode 100644
index 0000000..1f1ea9b
--- /dev/null
+++ b/g2p/text/cmudict.py
@@ -0,0 +1,76 @@
+import re
+
+
+valid_symbols = [
+    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
+]
+
+_valid_symbol_set = set(valid_symbols)
+
+
+class CMUDict:
+    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
+
+    def __init__(self, file_or_path, keep_ambiguous=True):
+        '''Loads entries from a path (opened as latin-1) or an iterable of lines.
+
+        Args:
+            file_or_path: path string, or an already-open file / iterable of lines
+            keep_ambiguous: if False, drop words with more than one pronunciation
+        '''
+        if isinstance(file_or_path, str):
+            with open(file_or_path, encoding='latin-1') as f:
+                entries = _parse_cmudict(f)
+        else:
+            entries = _parse_cmudict(file_or_path)
+        if not keep_ambiguous:
+            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
+        self._entries = entries
+
+    def __len__(self):
+        '''Number of distinct words.'''
+        return len(self._entries)
+
+    def lookup(self, word):
+        '''Returns list of ARPAbet pronunciations of the given word, or None if unknown.'''
+        return self._entries.get(word.upper())
+
+
+_alt_re = re.compile(r'\([0-9]+\)')
+
+
+def _parse_cmudict(file):
+    '''Builds {word: [pronunciation, ...]} from CMUDict-formatted lines.
+
+    Only lines starting with A-Z or an apostrophe are read; "(n)" alternate
+    markers are stripped from the word, and pronunciations containing symbols
+    outside valid_symbols are skipped.
+    '''
+    cmudict = {}
+    for line in file:
+        if not line or not ('A' <= line[0] <= 'Z' or line[0] == "'"):
+            continue
+        # Split at the first whitespace run; a line holding only a word has
+        # no pronunciation and is skipped instead of raising IndexError.
+        parts = line.split(None, 1)
+        if len(parts) < 2:
+            continue
+        word = _alt_re.sub('', parts[0])
+        pronunciation = _get_pronunciation(parts[1])
+        if pronunciation:
+            cmudict.setdefault(word, []).append(pronunciation)
+    return cmudict
+
+
+def _get_pronunciation(s):
+    '''Returns s stripped, or None if any space-separated part is not a valid symbol.'''
+    parts = s.strip().split(' ')
+    if any(part not in _valid_symbol_set for part in parts):
+        return None
+    return ' '.join(parts)
diff --git a/g2p/text/numbers.py b/g2p/text/numbers.py
new file mode 100644
index 0000000..93f676d
--- /dev/null
+++ b/g2p/text/numbers.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+import inflect
+import re
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m):
+    '''Drops the commas from a matched number.'''
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    '''Replaces the "." of a matched decimal with " point ".'''
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    '''Rewrites a matched dollar amount as "D dollars, C cents".
+
+    Zero parts are left out, units are singular for 1, and an amount with more
+    than one "." is returned unchanged followed by " dollars".
+    '''
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    # The pattern also admits commas (e.g. "$,5"); int() would reject them.
+    parts = [part.replace(',', '') for part in parts]
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    amounts = []
+    if dollars:
+        amounts.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
+    if cents:
+        amounts.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
+    return ', '.join(amounts) if amounts else 'zero dollars'
+
+
+def _expand_ordinal(m):
+    '''Passes the matched ordinal (digits plus st/nd/rd/th) to inflect.'''
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    '''Spells out a matched integer, with special readings for 1001-2999.'''
+    num = int(m.group(0))
+    if 1000 < num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        if 2000 < num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        if num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        # Read in groups of two digits, with "oh" for zero.
+        return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+    '''Rewrites comma numbers, currency, decimals, ordinals and integers in text.'''
+    # The bare-number pattern runs last because it would otherwise consume the
+    # digits that the more specific patterns need to see.
+    text = _comma_number_re.sub(_remove_commas, text)
+    text = _pounds_re.sub(r'\1 pounds', text)
+    text = _dollars_re.sub(_expand_dollars, text)
+    text = _decimal_number_re.sub(_expand_decimal_point, text)
+    text = _ordinal_re.sub(_expand_ordinal, text)
+    text = _number_re.sub(_expand_number, text)
+    return text
diff --git a/g2p/text/symbols.py b/g2p/text/symbols.py
new file mode 100644
index 0000000..da87c93
--- /dev/null
+++ b/g2p/text/symbols.py
@@ -0,0 +1,17 @@
+'''
+Defines the set of symbols used in text input to the model.
+
+The default is a set of ASCII characters that works well for English or text that has been run
+through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
+'''
+from .cmudict import valid_symbols
+
+_pad = '_'
+_eos = '~'
+_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
+
+# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
+_arpabet = ['@' + s for s in valid_symbols]
+
+# Export all symbols:
+symbols = [_pad, _eos] + list(_characters) + _arpabet