From fb64c79f7a906073279fa1ec7c2a53c329285d9c Mon Sep 17 00:00:00 2001 From: lfchener Date: Thu, 10 Dec 2020 07:05:40 +0000 Subject: [PATCH] add normalize function in normalizer.py --- parakeet/frontend/normalizer/normalizer.py | 32 ++++++++++++++++++++++ parakeet/frontend/phonectic.py | 24 +++------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/parakeet/frontend/normalizer/normalizer.py b/parakeet/frontend/normalizer/normalizer.py index e69de29..96981f8 100644 --- a/parakeet/frontend/normalizer/normalizer.py +++ b/parakeet/frontend/normalizer/normalizer.py @@ -0,0 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import unicodedata +from builtins import str as unicode +from parakeet.frontend.normalizer.numbers import normalize_numbers + + +def normalize(sentence): + # Canonicalize text: normalize numbers, strip accents, lowercase, drop unsupported characters, expand "i.e."/"e.g." + text = unicode(sentence) + text = normalize_numbers(text) + text = ''.join( + char for char in unicodedata.normalize('NFD', text) + if unicodedata.category(char) != 'Mn') # Strip accents + text = text.lower() + text = re.sub(r"[^ a-z'.,?!\-]", "", text) + text = text.replace("i.e.", "that is") + text = text.replace("e.g.", "for example") + return text diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py index f814681..50cf432 100644 --- a/parakeet/frontend/phonectic.py +++ b/parakeet/frontend/phonectic.py @@ -16,13 +16,10 @@ from abc import ABC, abstractmethod from typing import Union from g2p_en import G2p from g2pM import G2pM -import re -import unicodedata -from builtins import str as unicode from parakeet.frontend import Vocab from opencc import OpenCC from parakeet.frontend.punctuation import get_punctuations -from parakeet.frontend.normalizer.numbers import normalize_numbers +from parakeet.frontend.normalizer.normalizer import normalize __all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"] @@ -77,29 +74,16 @@ class English(Phonetics): class EnglishCharacter(Phonetics): def __init__(self): self.backend = G2p() - self.phonemes = list(self.backend.graphemes) + self.graphemes = list(self.backend.graphemes) self.punctuations = get_punctuations("en") - self.vocab = Vocab(self.phonemes + self.punctuations) - - def _prepocessing(self, text): - # preprocessing - text = unicode(text) - text = normalize_numbers(text) - text = ''.join( - char for char in unicodedata.normalize('NFD', text) - if unicodedata.category(char) != 'Mn') # Strip accents - text = text.lower() - text = re.sub(r"[^ a-z'.,?!\-]", "", text) - text = text.replace("i.e.", "that is") - text = text.replace("e.g.", "for example") - return text + self.vocab = Vocab(self.graphemes + 
self.punctuations) def phoneticize(self, sentence): start = self.vocab.start_symbol end = self.vocab.end_symbol chars = ([] if start is None else [start]) \ - + _prepocessing(sentence) \ + + list(normalize(sentence)) \ + ([] if end is None else [end]) return chars