From 580655f33f1b8d1e3ae449ae13d90f6d9effbf66 Mon Sep 17 00:00:00 2001
From: iclementine
Date: Tue, 20 Oct 2020 16:06:11 +0800
Subject: [PATCH] add phonetics & vocab & punctuation

---
 parakeet/__init__.py                         |   2 +-
 parakeet/frontend/__init__.py                |   3 +
 parakeet/frontend/normalizer/abbrrviation.py |   0
 parakeet/frontend/normalizer/acronyms.py     |   0
 parakeet/frontend/normalizer/normalizer.py   |   0
 parakeet/frontend/normalizer/numbers.py      |   3 +
 parakeet/frontend/normalizer/width.py        |  24 ++++
 parakeet/frontend/phonectic.py               |  85 ++++++++++++++
 parakeet/frontend/punctuation.py             |  33 ++++++
 parakeet/frontend/vocab.py                   |  79 +++++++++++++
 parakeet/g2p/__init__.py                     |  32 ------
 parakeet/g2p/en/__init__.py                  |  34 ------
 parakeet/g2p/es/__init__.py                  |  14 ---
 parakeet/g2p/jp/__init__.py                  |  77 -------------
 parakeet/g2p/ko/__init__.py                  |  17 ---
 parakeet/g2p/text/__init__.py                |  89 ---------------
 parakeet/g2p/text/cleaners.py                | 110 -------------------
 parakeet/g2p/text/cmudict.py                 |  78 -------------
 parakeet/g2p/text/numbers.py                 |  71 ------------
 parakeet/g2p/text/symbols.py                 |  30 -----
 20 files changed, 228 insertions(+), 553 deletions(-)
 create mode 100644 parakeet/frontend/__init__.py
 create mode 100644 parakeet/frontend/normalizer/abbrrviation.py
 create mode 100644 parakeet/frontend/normalizer/acronyms.py
 create mode 100644 parakeet/frontend/normalizer/normalizer.py
 create mode 100644 parakeet/frontend/normalizer/numbers.py
 create mode 100644 parakeet/frontend/normalizer/width.py
 create mode 100644 parakeet/frontend/phonectic.py
 create mode 100644 parakeet/frontend/punctuation.py
 create mode 100644 parakeet/frontend/vocab.py
 delete mode 100644 parakeet/g2p/__init__.py
 delete mode 100644 parakeet/g2p/en/__init__.py
 delete mode 100644 parakeet/g2p/es/__init__.py
 delete mode 100644 parakeet/g2p/jp/__init__.py
 delete mode 100644 parakeet/g2p/ko/__init__.py
 delete mode 100644 parakeet/g2p/text/__init__.py
 delete mode 100644 parakeet/g2p/text/cleaners.py
 delete mode 100644 parakeet/g2p/text/cmudict.py
 delete mode 100644 parakeet/g2p/text/numbers.py
 delete mode 100644 parakeet/g2p/text/symbols.py
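
Notes (commentary below the diffstat; `git am` ignores this region):

This patch replaces the old `parakeet.g2p` package with a new
`parakeet.frontend` package built on third-party G2P backends (g2p_en for
English; g2pM plus OpenCC for Chinese). A minimal usage sketch of the added
API, assuming g2p_en, g2pM and opencc are installed with their data files
available; the sentences below are made up for illustration:

    from parakeet.frontend import English, Chinese

    en = English()
    phonemes = en.phoneticize("Hello, world.")  # ARPAbet symbols + punctuation
    ids = en.numericalize(phonemes)             # ids into en.vocab; OOV items are dropped
    symbols = en.reverse(ids)                   # map ids back to symbols

    cn = Chinese()
    ids = cn("你好。")  # __call__ chains phoneticize() and numericalize()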
diff --git a/parakeet/__init__.py b/parakeet/__init__.py
index 9be1aaf..4f26116 100644
--- a/parakeet/__init__.py
+++ b/parakeet/__init__.py
@@ -14,4 +14,4 @@
 
 __version__ = "0.0.0"
 
-from . import data, g2p, models, modules
+from parakeet import data, frontend, models, modules
diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py
new file mode 100644
index 0000000..c49b725
--- /dev/null
+++ b/parakeet/frontend/__init__.py
@@ -0,0 +1,3 @@
+from parakeet.frontend.vocab import *
+from parakeet.frontend.phonectic import *
+from parakeet.frontend.punctuation import *
diff --git a/parakeet/frontend/normalizer/abbrrviation.py b/parakeet/frontend/normalizer/abbrrviation.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/frontend/normalizer/acronyms.py b/parakeet/frontend/normalizer/acronyms.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/frontend/normalizer/normalizer.py b/parakeet/frontend/normalizer/normalizer.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/frontend/normalizer/numbers.py b/parakeet/frontend/normalizer/numbers.py
new file mode 100644
index 0000000..ef7343c
--- /dev/null
+++ b/parakeet/frontend/normalizer/numbers.py
@@ -0,0 +1,3 @@
+# number expansion is not that easy
+import num2words
+import inflect
\ No newline at end of file
diff --git a/parakeet/frontend/normalizer/width.py b/parakeet/frontend/normalizer/width.py
new file mode 100644
index 0000000..440557f
--- /dev/null
+++ b/parakeet/frontend/normalizer/width.py
@@ -0,0 +1,24 @@
+def full2half_width(ustr):
+    half = []
+    for u in ustr:
+        num = ord(u)
+        if num == 0x3000:  # full-width space to half-width
+            num = 32
+        elif 0xFF01 <= num <= 0xFF5E:
+            num -= 0xfee0
+        u = chr(num)
+        half.append(u)
+    return ''.join(half)
+
+def half2full_width(ustr):
+    full = []
+    for u in ustr:
+        num = ord(u)
+        if num == 32:  # half-width space to full-width
+            num = 0x3000
+        elif 0x21 <= num <= 0x7E:
+            num += 0xfee0
+        u = chr(num)  # to unicode
+        full.append(u)
+
+    return ''.join(full)
\ No newline at end of file
diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py
new file mode 100644
index 0000000..34be5fc
--- /dev/null
+++ b/parakeet/frontend/phonectic.py
@@ -0,0 +1,85 @@
+from abc import ABC, abstractmethod
+from typing import Union
+from g2p_en import G2p
+from g2pM import G2pM
+from parakeet.frontend import Vocab
+from opencc import OpenCC
+from parakeet.frontend.punctuation import get_punctuations
+
+class Phonetics(ABC):
+    @abstractmethod
+    def __call__(self, sentence):
+        pass
+
+    @abstractmethod
+    def phoneticize(self, sentence):
+        pass
+
+    @abstractmethod
+    def numericalize(self, phonemes):
+        pass
+
+class English(Phonetics):
+    def __init__(self):
+        self.backend = G2p()
+        self.phonemes = list(self.backend.phonemes)
+        self.punctuations = get_punctuations("en")
+        self.vocab = Vocab(self.phonemes + self.punctuations)
+
+    def phoneticize(self, sentence):
+        return self.backend(sentence)
+
+    def numericalize(self, phonemes):
+        ids = [self.vocab.lookup(item) for item in phonemes if item in self.vocab.stoi]
+        return ids
+
+    def reverse(self, ids):
+        return [self.vocab.reverse(i) for i in ids]
+
+    def __call__(self, sentence):
+        return self.numericalize(self.phoneticize(sentence))
+
+    def vocab_size(self):
+        return len(self.vocab)
+
+
+class Chinese(Phonetics):
+    def __init__(self):
+        self.opencc_backend = OpenCC('t2s.json')
+        self.backend = G2pM()
+        self.phonemes = self._get_all_syllables()
+        self.punctuations = get_punctuations("cn")
+        self.vocab = Vocab(self.phonemes + self.punctuations)
+
+    def _get_all_syllables(self):
+        all_syllables = set([syllable for k, v in self.backend.cedict.items() for syllable in v])
+        return list(all_syllables)
+
+    def phoneticize(self, sentence):
+        simplified = self.opencc_backend.convert(sentence)
+        phonemes = self.backend(simplified)
+        return self._filter_symbols(phonemes)
+
+    def _filter_symbols(self, phonemes):
+        cleaned_phonemes = []
+        for item in phonemes:
+            if item in self.vocab.stoi:
+                cleaned_phonemes.append(item)
+            else:
+                for char in item:
+                    if char in self.vocab.stoi:
+                        cleaned_phonemes.append(char)
+        return cleaned_phonemes
+
+    def numericalize(self, phonemes):
+        ids = [self.vocab.lookup(item) for item in phonemes]
+        return ids
+
+    def __call__(self, sentence):
+        return self.numericalize(self.phoneticize(sentence))
+
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def reverse(self, ids):
+        return [self.vocab.reverse(i) for i in ids]
diff --git a/parakeet/frontend/punctuation.py b/parakeet/frontend/punctuation.py
new file mode 100644
index 0000000..9984970
--- /dev/null
+++ b/parakeet/frontend/punctuation.py
@@ -0,0 +1,33 @@
+import abc
+import string
+
+__all__ = ["get_punctuations"]
+
+EN_PUNCT = [
+    " ",
+    "-",
+    "...",
+    ",",
+    ".",
+    "?",
+    "!",
+]
+
+CN_PUNCT = [
+    "、",
+    "，",
+    "；",
+    "：",
+    "。",
+    "？",
+    "！"
+]
+
+def get_punctuations(lang):
+    if lang == "en":
+        return EN_PUNCT
+    elif lang == "cn":
+        return CN_PUNCT
+    else:
+        raise ValueError(f"language {lang} not supported")
+
diff --git a/parakeet/frontend/vocab.py b/parakeet/frontend/vocab.py
new file mode 100644
index 0000000..3dbf316
--- /dev/null
+++ b/parakeet/frontend/vocab.py
@@ -0,0 +1,79 @@
+from typing import Dict, Iterable, List
+from ruamel import yaml
+from collections import OrderedDict
+
+class Vocab(object):
+    def __init__(self, symbols: Iterable[str],
+                 padding_symbol="<pad>",
+                 unk_symbol="<unk>",
+                 start_symbol="<s>",
+                 end_symbol="</s>"):
+        self.special_symbols = OrderedDict()
+        for i, item in enumerate(
+                [padding_symbol, unk_symbol, start_symbol, end_symbol]):
+            if item:
+                self.special_symbols[item] = len(self.special_symbols)
+
+        self.padding_symbol = padding_symbol
+        self.unk_symbol = unk_symbol
+        self.start_symbol = start_symbol
+        self.end_symbol = end_symbol
+
+
+        self.stoi = OrderedDict()
+        self.stoi.update(self.special_symbols)
+        N = len(self.special_symbols)
+
+        for i, s in enumerate(symbols):
+            if s not in self.stoi:
+                self.stoi[s] = N + i
+        self.itos = {v: k for k, v in self.stoi.items()}
+
+    def __len__(self):
+        return len(self.stoi)
+
+    @property
+    def num_specials(self):
+        return len(self.special_symbols)
+
+    # special tokens
+    @property
+    def padding_index(self):
+        return self.stoi.get(self.padding_symbol, -1)
+
+    @property
+    def unk_index(self):
+        return self.stoi.get(self.unk_symbol, -1)
+
+    @property
+    def start_index(self):
+        return self.stoi.get(self.start_symbol, -1)
+
+    @property
+    def end_index(self):
+        return self.stoi.get(self.end_symbol, -1)
+
+    def __repr__(self):
+        fmt = "Vocab(size: {},\nstoi:\n{})"
+        return fmt.format(len(self), self.stoi)
+
+    def __str__(self):
+        return self.__repr__()
+
+    def lookup(self, symbol):
+        return self.stoi[symbol]
+
+    def reverse(self, index):
+        return self.itos[index]
+
+    def add_symbol(self, symbol):
+        if symbol in self.stoi:
+            return
+        N = len(self.stoi)
+        self.stoi[symbol] = N
+        self.itos[N] = symbol
+
+    def add_symbols(self, symbols):
+        for symbol in symbols:
+            self.add_symbol(symbol)
+
diff --git a/parakeet/g2p/__init__.py b/parakeet/g2p/__init__.py
deleted file mode 100644
index 5840f33..0000000
--- a/parakeet/g2p/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# coding: utf-8
-"""Text processing frontend
-
-All frontend module should have the following functions:
-
-- text_to_sequence(text, p)
-- sequence_to_text(sequence)
-
-and the property:
-
-- n_vocab
-
-"""
-from . import en
-
-# optinoal Japanese frontend
-try:
-    from . import jp
-except ImportError:
-    jp = None
-
-try:
-    from . import ko
-except ImportError:
-    ko = None
-
-# if you are going to use the frontend, you need to modify _characters in symbol.py:
-# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ'
-try:
-    from . import es
-except ImportError:
-    es = None
diff --git a/parakeet/g2p/en/__init__.py b/parakeet/g2p/en/__init__.py
deleted file mode 100644
index 01dd223..0000000
--- a/parakeet/g2p/en/__init__.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding: utf-8
-
-from ..text.symbols import symbols
-from ..text import sequence_to_text
-
-import nltk
-from random import random
-
-n_vocab = len(symbols)
-
-_arpabet = nltk.corpus.cmudict.dict()
-
-
-def _maybe_get_arpabet(word, p):
-    try:
-        phonemes = _arpabet[word][0]
-        phonemes = " ".join(phonemes)
-    except KeyError:
-        return word
-
-    return '{%s}' % phonemes if random() < p else word
-
-
-def mix_pronunciation(text, p):
-    text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
-    return text
-
-
-def text_to_sequence(text, p=0.0):
-    if p >= 0:
-        text = mix_pronunciation(text, p)
-    from ..text import text_to_sequence
-    text = text_to_sequence(text, ["english_cleaners"])
-    return text
diff --git a/parakeet/g2p/es/__init__.py b/parakeet/g2p/es/__init__.py
deleted file mode 100644
index 8ac385f..0000000
--- a/parakeet/g2p/es/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# coding: utf-8
-from ..text.symbols import symbols
-from ..text import sequence_to_text
-
-import nltk
-from random import random
-
-n_vocab = len(symbols)
-
-
-def text_to_sequence(text, p=0.0):
-    from ..text import text_to_sequence
-    text = text_to_sequence(text, ["basic_cleaners"])
-    return text
diff --git a/parakeet/g2p/jp/__init__.py b/parakeet/g2p/jp/__init__.py
deleted file mode 100644
index 36c7fd8..0000000
--- a/parakeet/g2p/jp/__init__.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# coding: utf-8
-
-import MeCab
-import jaconv
-from random import random
-
-n_vocab = 0xffff
-
-_eos = 1
-_pad = 0
-_tagger = None
-
-
-def _yomi(mecab_result):
-    tokens = []
-    yomis = []
-    for line in mecab_result.split("\n")[:-1]:
-        s = line.split("\t")
-        if len(s) == 1:
-            break
-        token, rest = s
-        rest = rest.split(",")
-        tokens.append(token)
-        yomi = rest[7] if len(rest) > 7 else None
-        yomi = None if yomi == "*" else yomi
-        yomis.append(yomi)
-
-    return tokens, yomis
-
-
-def _mix_pronunciation(tokens, yomis, p):
-    return "".join(yomis[idx]
-                   if yomis[idx] is not None and random() < p else tokens[idx]
-                   for idx in range(len(tokens)))
-
-
-def mix_pronunciation(text, p):
-    global _tagger
-    if _tagger is None:
-        _tagger = MeCab.Tagger("")
-    tokens, yomis = _yomi(_tagger.parse(text))
-    return _mix_pronunciation(tokens, yomis, p)
-
-
-def add_punctuation(text):
-    last = text[-1]
-    if last not in [".", ",", "、", "。", "！", "？", "!", "?"]:
-        text = text + "。"
-    return text
-
-
-def normalize_delimitor(text):
-    text = text.replace(",", "、")
-    text = text.replace(".", "。")
-    text = text.replace("，", "、")
-    text = text.replace("．", "。")
-    return text
-
-
-def text_to_sequence(text, p=0.0):
-    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】", "（", "）", "(", ")"]:
-        text = text.replace(c, "")
-    text = text.replace("!", "！")
-    text = text.replace("?", "？")
-
-    text = normalize_delimitor(text)
-    text = jaconv.normalize(text)
-    if p > 0:
-        text = mix_pronunciation(text, p)
-    text = jaconv.hira2kata(text)
-    text = add_punctuation(text)
-
-    return [ord(c) for c in text] + [_eos]  # EOS
-
-
-def sequence_to_text(seq):
-    return "".join(chr(n) for n in seq)
diff --git a/parakeet/g2p/ko/__init__.py b/parakeet/g2p/ko/__init__.py
deleted file mode 100644
index ccb8b5f..0000000
--- a/parakeet/g2p/ko/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf-8
-
-from random import random
-
-n_vocab = 0xffff
-
-_eos = 1
-_pad = 0
-_tagger = None
-
-
-def text_to_sequence(text, p=0.0):
-    return [ord(c) for c in text] + [_eos]  # EOS
-
-
-def sequence_to_text(seq):
-    return "".join(chr(n) for n in seq)
diff --git a/parakeet/g2p/text/__init__.py b/parakeet/g2p/text/__init__.py
deleted file mode 100644
index 312b720..0000000
--- a/parakeet/g2p/text/__init__.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from . import cleaners
-from .symbols import symbols
-
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-# Regular expression matching text enclosed in curly braces:
-_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-
-
-def text_to_sequence(text, cleaner_names):
-    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-
-    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
-    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
-
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-
-    Returns:
-      List of integers corresponding to the symbols in the text
-    '''
-    sequence = []
-
-    # Check for curly braces and treat their contents as ARPAbet:
-    while len(text):
-        m = _curly_re.match(text)
-        if not m:
-            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
-            break
-        sequence += _symbols_to_sequence(
-            _clean_text(m.group(1), cleaner_names))
-        sequence += _arpabet_to_sequence(m.group(2))
-        text = m.group(3)
-
-    # Append EOS token
-    sequence.append(_symbol_to_id['~'])
-    return sequence
-
-
-def sequence_to_text(sequence):
-    '''Converts a sequence of IDs back to a string'''
-    result = ''
-    for symbol_id in sequence:
-        if symbol_id in _id_to_symbol:
-            s = _id_to_symbol[symbol_id]
-            # Enclose ARPAbet back in curly braces:
-            if len(s) > 1 and s[0] == '@':
-                s = '{%s}' % s[1:]
-            result += s
-    return result.replace('}{', ' ')
-
-
-def _clean_text(text, cleaner_names):
-    for name in cleaner_names:
-        cleaner = getattr(cleaners, name)
-        if not cleaner:
-            raise Exception('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    return text
-
-
-def _symbols_to_sequence(symbols):
-    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
-
-
-def _arpabet_to_sequence(text):
-    return _symbols_to_sequence(['@' + s for s in text.split()])
-
-
-def _should_keep_symbol(s):
-    return s in _symbol_to_id and s is not '_' and s is not '~'
diff --git a/parakeet/g2p/text/cleaners.py b/parakeet/g2p/text/cleaners.py
deleted file mode 100644
index 58553c1..0000000
--- a/parakeet/g2p/text/cleaners.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-'''
-
-import re
-from unidecode import unidecode
-from .numbers import normalize_numbers
-
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'),
-                  ]]
-
-
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-
-
-def expand_numbers(text):
-    return normalize_numbers(text)
-
-
-def lowercase(text):
-    return text.lower()
-
-
-def collapse_whitespace(text):
-    return re.sub(_whitespace_re, ' ', text)
-
-
-def convert_to_ascii(text):
-    return unidecode(text)
-
-
-def add_punctuation(text):
-    if len(text) == 0:
-        return text
-    if text[-1] not in '!,.:;?':
-        text = text + '.'  # without this decoder is confused when to output EOS
-    return text
-
-
-def basic_cleaners(text):
-    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def transliteration_cleaners(text):
-    '''Pipeline for non-English text that transliterates to ASCII.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-
-
-def english_cleaners(text):
-    '''Pipeline for English text, including number and abbreviation expansion.'''
-    text = convert_to_ascii(text)
-    #text = add_punctuation(text)
-    text = lowercase(text)
-    text = expand_numbers(text)
-    text = expand_abbreviations(text)
-    text = collapse_whitespace(text)
-    return text
diff --git a/parakeet/g2p/text/cmudict.py b/parakeet/g2p/text/cmudict.py
deleted file mode 100644
index bbe7903..0000000
--- a/parakeet/g2p/text/cmudict.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-
-valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
-    'Y', 'Z', 'ZH'
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
-    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
-
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if isinstance(file_or_path, str):
-            with open(file_or_path, encoding='latin-1') as f:
-                entries = _parse_cmudict(f)
-        else:
-            entries = _parse_cmudict(file_or_path)
-        if not keep_ambiguous:
-            entries = {
-                word: pron
-                for word, pron in entries.items() if len(pron) == 1
-            }
-        self._entries = entries
-
-    def __len__(self):
-        return len(self._entries)
-
-    def lookup(self, word):
-        '''Returns list of ARPAbet pronunciations of the given word.'''
-        return self._entries.get(word.upper())
-
-
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
-    cmudict = {}
-    for line in file:
-        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
-            parts = line.split('  ')
-            word = re.sub(_alt_re, '', parts[0])
-            pronunciation = _get_pronunciation(parts[1])
-            if pronunciation:
-                if word in cmudict:
-                    cmudict[word].append(pronunciation)
-                else:
-                    cmudict[word] = [pronunciation]
-    return cmudict
-
-
-def _get_pronunciation(s):
-    parts = s.strip().split(' ')
-    for part in parts:
-        if part not in _valid_symbol_set:
-            return None
-    return ' '.join(parts)
diff --git a/parakeet/g2p/text/numbers.py b/parakeet/g2p/text/numbers.py
deleted file mode 100644
index 24b5817..0000000
--- a/parakeet/g2p/text/numbers.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import inflect
-import re
-
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
-_number_re = re.compile(r'[0-9]+')
-
-
-def _remove_commas(m):
-    return m.group(1).replace(',', '')
-
-
-def _expand_decimal_point(m):
-    return m.group(1).replace('.', ' point ')
-
-
-def _expand_dollars(m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2:
-        return match + ' dollars'  # Unexpected format
-    dollars = int(parts[0]) if parts[0] else 0
-    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
-    elif dollars:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        return '%s %s' % (dollars, dollar_unit)
-    elif cents:
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s' % (cents, cent_unit)
-    else:
-        return 'zero dollars'
-
-
-def _expand_ordinal(m):
-    return _inflect.number_to_words(m.group(0))
-
-
-def _expand_number(m):
-    num = int(m.group(0))
-    if num > 1000 and num < 3000:
-        if num == 2000:
-            return 'two thousand'
-        elif num > 2000 and num < 2010:
-            return 'two thousand ' + _inflect.number_to_words(num % 100)
-        elif num % 100 == 0:
-            return _inflect.number_to_words(num // 100) + ' hundred'
-        else:
-            return _inflect.number_to_words(
-                num, andword='', zero='oh', group=2).replace(', ', ' ')
-    else:
-        return _inflect.number_to_words(num, andword='')
-
-
-def normalize_numbers(text):
-    text = re.sub(_comma_number_re, _remove_commas, text)
-    text = re.sub(_pounds_re, r'\1 pounds', text)
-    text = re.sub(_dollars_re, _expand_dollars, text)
-    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-    text = re.sub(_ordinal_re, _expand_ordinal, text)
-    text = re.sub(_number_re, _expand_number, text)
-    return text
diff --git a/parakeet/g2p/text/symbols.py b/parakeet/g2p/text/symbols.py
deleted file mode 100644
index 299ca58..0000000
--- a/parakeet/g2p/text/symbols.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text that has been run
-through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
-'''
-from .cmudict import valid_symbols
-
-_pad = '_'
-_eos = '~'
-_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
-
-# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in valid_symbols]
-
-# Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
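
Notes (trailing commentary, not part of the patch):

The `Vocab` added above assigns ids to the special symbols first and then
numbers the regular symbols. A small sketch of the resulting behavior,
assuming the "<pad>"/"<unk>"/"<s>"/"</s>" defaults from the patch:

    from parakeet.frontend.vocab import Vocab

    vocab = Vocab(["AA", "AE", "B"])
    assert vocab.num_specials == 4      # <pad>, <unk>, <s>, </s> take ids 0..3
    assert vocab.lookup("AA") == 4      # regular symbols start after the specials
    assert vocab.reverse(0) == "<pad>"
    vocab.add_symbol("CH")              # appended with the next free id, here 7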