From b674f63d747a80a95cde6a5c293f6f10c1bc6193 Mon Sep 17 00:00:00 2001
From: iclementine <chenfeiyu@baidu.com>
Date: Thu, 8 Apr 2021 04:59:29 +0800
Subject: [PATCH] add 2 frontend

---
 parakeet/frontend/arpabet.py | 286 +++++++++++++++++++++++++++++++
 parakeet/frontend/pinyin.py  | 320 +++++++++++++++++++++++++++++++++++
 2 files changed, 606 insertions(+)
 create mode 100644 parakeet/frontend/arpabet.py
 create mode 100644 parakeet/frontend/pinyin.py

diff --git a/parakeet/frontend/arpabet.py b/parakeet/frontend/arpabet.py
new file mode 100644
index 0000000..f683462
--- /dev/null
+++ b/parakeet/frontend/arpabet.py
@@ -0,0 +1,286 @@
+from parakeet.frontend.phonectic import Phonetics
+"""
+A phonology system with ARPABET symbols and limited punctuations. The G2P 
+conversion is done by g2p_en.
+
+Note that g2p_en does not handle hyphenated words well, so make sure the input
+sentence is normalized first.
+"""
+from parakeet.frontend.vocab import Vocab
+from g2p_en import G2p
+
+
+
+class ARPABET(Phonetics):
+    """A phonology for English that uses ARPABET as the phoneme vocabulary.
+    See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
+    Phoneme Example Translation
+        ------- ------- -----------
+        AA      odd     AA D
+        AE      at      AE T
+        AH      hut     HH AH T
+        AO      ought   AO T
+        AW      cow     K AW
+        AY      hide    HH AY D
+        B       be      B IY
+        CH      cheese  CH IY Z
+        D       dee     D IY
+        DH      thee    DH IY
+        EH      Ed      EH D
+        ER      hurt    HH ER T
+        EY      ate     EY T
+        F       fee     F IY
+        G       green   G R IY N
+        HH      he      HH IY
+        IH      it      IH T
+        IY      eat     IY T
+        JH      gee     JH IY
+        K       key     K IY
+        L       lee     L IY
+        M       me      M IY
+        N       knee    N IY
+        NG      ping    P IH NG
+        OW      oat     OW T
+        OY      toy     T OY
+        P       pee     P IY
+        R       read    R IY D
+        S       sea     S IY
+        SH      she     SH IY
+        T       tea     T IY
+        TH      theta   TH EY T AH
+        UH      hood    HH UH D
+        UW      two     T UW
+        V       vee     V IY
+        W       we      W IY
+        Y       yield   Y IY L D
+        Z       zee     Z IY
+        ZH      seizure S IY ZH ER
+    """
+    phonemes = [
+        'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
+        'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
+        'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UW', 'UH', 'V', 'W', 'Y', 'Z',
+        'ZH'
+    ]
+    punctuations = [',', '.', '?', '!']
+    symbols = phonemes + punctuations
+    _stress_to_no_stress_ = {
+        'AA0': 'AA',
+        'AA1': 'AA',
+        'AA2': 'AA',
+        'AE0': 'AE',
+        'AE1': 'AE',
+        'AE2': 'AE',
+        'AH0': 'AH',
+        'AH1': 'AH',
+        'AH2': 'AH',
+        'AO0': 'AO',
+        'AO1': 'AO',
+        'AO2': 'AO',
+        'AW0': 'AW',
+        'AW1': 'AW',
+        'AW2': 'AW',
+        'AY0': 'AY',
+        'AY1': 'AY',
+        'AY2': 'AY',
+        'EH0': 'EH',
+        'EH1': 'EH',
+        'EH2': 'EH',
+        'ER0': 'ER',
+        'ER1': 'ER',
+        'ER2': 'ER',
+        'EY0': 'EY',
+        'EY1': 'EY',
+        'EY2': 'EY',
+        'IH0': 'IH',
+        'IH1': 'IH',
+        'IH2': 'IH',
+        'IY0': 'IY',
+        'IY1': 'IY',
+        'IY2': 'IY',
+        'OW0': 'OW',
+        'OW1': 'OW',
+        'OW2': 'OW',
+        'OY0': 'OY',
+        'OY1': 'OY',
+        'OY2': 'OY',
+        'UH0': 'UH',
+        'UH1': 'UH',
+        'UH2': 'UH',
+        'UW0': 'UW',
+        'UW1': 'UW',
+        'UW2': 'UW'
+    }
+
+    def __init__(self):
+        self.backend = G2p()
+        self.vocab = Vocab(self.phonemes + self.punctuations)
+
+    def _remove_vowels(self, phone):  # NOTE: strips the stress digit, not vowels
+        return self._stress_to_no_stress_.get(phone, phone)
+
+    def phoneticize(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into a stress-free phoneme sequence.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence; it is passed to the backend as is.
+        add_start_end: bool
+            Whether to wrap the sequence in the vocab's start and end symbols.
+
+        Returns
+        ----------
+        List[str]
+            The pronunciation sequence; symbols not in the vocab are dropped.
+        """
+        phonemes = [self._remove_vowels(item) for item in self.backend(sentence)]
+        if add_start_end:
+            start = self.vocab.start_symbol
+            end = self.vocab.end_symbol
+            phonemes = [start] + phonemes + [end]
+        phonemes = [item for item in phonemes if item in self.vocab.stoi]
+        return phonemes
+
+    def numericalize(self, phonemes):
+        """ Convert pronunciation sequence into pronunciation id sequence.
+
+        Parameters
+        -----------
+        phonemes: List[str]
+            The list of pronunciation sequence.
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        ids = [self.vocab.lookup(item) for item in phonemes]
+        return ids
+
+    def reverse(self, ids):
+        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
+
+        Parameters
+        -----------
+        ids: List[int]
+            The list of pronunciation id sequence.
+
+        Returns
+        ----------
+        List[str]
+            The list of pronunciation sequence.
+        """
+        return [self.vocab.reverse(i) for i in ids]
+
+    def __call__(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into pronunciation id sequence.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence (add_start_end is passed on to phoneticize).
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
+
+    @property
+    def vocab_size(self):
+        """ Vocab size; expected 47 = 39 phones + 4 punctuations + 4 special tokens."""
+        return len(self.vocab)
+
+
+class ARPABETWithStress(Phonetics):  # ARPABET phonology that keeps the stress digits 0/1/2
+    phonemes = [
+        'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+        'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
+        'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
+        'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K',
+        'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P',
+        'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2',
+        'V', 'W', 'Y', 'Z', 'ZH'
+    ]
+    punctuations = [',', '.', '?', '!']
+    symbols = phonemes + punctuations
+
+    def __init__(self):
+        self.backend = G2p()
+        self.vocab = Vocab(self.phonemes + self.punctuations)
+
+    def phoneticize(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into a stressed phoneme sequence.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence; it is passed to the backend as is.
+        add_start_end: bool
+            Whether to wrap the sequence in the vocab's start and end symbols.
+
+        Returns
+        ----------
+        List[str]
+            The pronunciation sequence; symbols not in the vocab are dropped.
+        """
+        phonemes = self.backend(sentence)
+        if add_start_end:
+            start = self.vocab.start_symbol
+            end = self.vocab.end_symbol
+            phonemes = [start] + phonemes + [end]
+        phonemes = [item for item in phonemes if item in self.vocab.stoi]
+        return phonemes
+
+    def numericalize(self, phonemes):
+        """ Convert pronunciation sequence into pronunciation id sequence.
+
+        Parameters
+        -----------
+        phonemes: List[str]
+            The list of pronunciation sequence.
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        ids = [self.vocab.lookup(item) for item in phonemes]
+        return ids
+
+    def reverse(self, ids):
+        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
+
+        Parameters
+        -----------
+        ids: List[int]
+            The list of pronunciation id sequence.
+
+        Returns
+        ----------
+        List[str]
+            The list of pronunciation sequence.
+        """
+        return [self.vocab.reverse(i) for i in ids]
+
+    def __call__(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into pronunciation id sequence.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence (add_start_end is passed on to phoneticize).
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        return self.numericalize(self.phoneticize(sentence, add_start_end=add_start_end))
+
+    @property
+    def vocab_size(self):
+        """ Vocab size; expected 77 = 69 phones + 4 punctuations + 4 special tokens."""
+        return len(self.vocab)    
diff --git a/parakeet/frontend/pinyin.py b/parakeet/frontend/pinyin.py
new file mode 100644
index 0000000..de3680b
--- /dev/null
+++ b/parakeet/frontend/pinyin.py
@@ -0,0 +1,320 @@
+"""
+A Simple Chinese Phonology using pinyin symbols. 
+The G2P conversion converts pinyin strings to symbols. It can also handle strings
+of Chinese characters, but due to the complexity of Chinese G2P, text -> pinyin
+can be left to other parts of a TTS system, where other NLP techniques may be
+used (e.g. tokenization, tagging, NER...).
+"""
+import re
+from parakeet.frontend.phonectic import Phonetics
+from parakeet.frontend.vocab import Vocab
+import pypinyin
+from pypinyin.core import Pinyin, Style
+from pypinyin.core import DefaultConverter
+from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
+from itertools import product
+
+_punctuations = ['，', '。', '？', '！']  # full-width marks kept as symbols
+_initials = [
+    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh',
+    'ch', 'sh', 'r', 'z', 'c', 's'
+]
+_finals = [
+    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
+    'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
+    'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang',
+    'ueng', 'v', 've', 'van', 'ven', 'veng'
+]
+_ernized_symbol = ['&r']
+_phones = _initials + _finals + _ernized_symbol + _punctuations  # tone-free symbol set
+_tones = ['0', '1', '2', '3', '4', '5']  # '0' is what _split_syllable gives initials and punctuation
+# Toned symbol set: every final is combined with each of the tones '1'..'5'.
+_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])]
+_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations
+
+class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter):
+    """pypinyin's DefaultConverter combined with NeutralToneWith5Mixin (neutral tone written as 5)."""
+
+
+class ParakeetPinyin(Phonetics):
+    """Pinyin phonology that yields parallel phoneme and tone sequences."""
+
+    def __init__(self):
+        self.vocab_phonemes = Vocab(_phones)
+        self.vocab_tones = Vocab(_tones)
+        self.pinyin_backend = Pinyin(ParakeetConverter())
+
+    def convert_pypinyin_tone3(self, syllables, add_start_end=False):
+        """ Convert pypinyin TONE3 syllables into phoneme and tone sequences.
+
+        With add_start_end, each sequence is wrapped in the start and end
+        symbols of its own vocab. Symbols not in the vocab are dropped.
+        """
+        phonemes, tones = _convert_to_parakeet_style_pinyin(syllables)
+
+        if add_start_end:
+            start = self.vocab_phonemes.start_symbol
+            end = self.vocab_phonemes.end_symbol
+            phonemes = [start] + phonemes + [end]
+
+            # Wrap the tones themselves; assigning this result to `phonemes`
+            # would replace the phonemes and leave `tones` unwrapped.
+            start = self.vocab_tones.start_symbol
+            end = self.vocab_tones.end_symbol
+            tones = [start] + tones + [end]
+
+        phonemes = [p for p in phonemes if p in self.vocab_phonemes.stoi]
+        tones = [item for item in tones if item in self.vocab_tones.stoi]
+        return phonemes, tones
+
+    def phoneticize(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into phoneme and tone sequences.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence.
+        add_start_end: bool
+            Whether to wrap each sequence in its vocab's start/end symbols.
+
+        Returns
+        ----------
+        Tuple[List[str], List[str]]
+            The list of phonemes and the list of tones.
+        """
+        syllables = self.pinyin_backend.lazy_pinyin(sentence,
+                                                    style=Style.TONE3,
+                                                    strict=True)
+        return self.convert_pypinyin_tone3(syllables,
+                                           add_start_end=add_start_end)
+
+    def numericalize(self, phonemes, tones):
+        """ Convert phoneme and tone sequences into id sequences.
+
+        Parameters
+        -----------
+        phonemes: List[str]
+            The list of phonemes.
+        tones: List[str]
+            The list of tones.
+
+        Returns
+        ----------
+        Tuple[List[int], List[int]]
+            The list of phoneme ids and the list of tone ids.
+        """
+        phoneme_ids = [self.vocab_phonemes.lookup(item) for item in phonemes]
+        tone_ids = [self.vocab_tones.lookup(item) for item in tones]
+        return phoneme_ids, tone_ids
+
+    def __call__(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into phoneme and tone id sequences.
+
+        Takes the same arguments as `phoneticize` and returns the pair of
+        id lists (phoneme ids, tone ids) produced by `numericalize`.
+        """
+        phonemes, tones = self.phoneticize(sentence,
+                                           add_start_end=add_start_end)
+        return self.numericalize(phonemes, tones)
+
+    @property
+    def vocab_size(self):
+        """ Phoneme vocab size; expected 70 = 62 phones + 4 punctuations + 4 special tokens."""
+        return len(self.vocab_phonemes)
+
+    @property
+    def tone_vocab_size(self):
+        """ Tone vocab size; expected 10 = 1 non tone + 5 tone + 4 special tokens."""
+        return len(self.vocab_tones)
+
+
+
+class ParakeetPinyinWithTone(Phonetics):
+    """Pinyin phonology whose finals carry the tone digit (e.g. ``ang1``)."""
+
+    def __init__(self):
+        self.vocab = Vocab(_toned_phonems)
+        self.pinyin_backend = Pinyin(ParakeetConverter())
+
+    def convert_pypinyin_tone3(self, syllables, add_start_end=False):
+        """ Convert pypinyin TONE3 syllables into a toned phoneme sequence.
+
+        With add_start_end, the sequence is wrapped in the vocab's start
+        and end symbols. Symbols not in the vocab are dropped.
+        """
+        phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables)
+
+        if add_start_end:
+            # This class keeps a single vocab, `self.vocab`.
+            start = self.vocab.start_symbol
+            end = self.vocab.end_symbol
+            phonemes = [start] + phonemes + [end]
+
+        return [item for item in phonemes if item in self.vocab.stoi]
+
+    def phoneticize(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into a toned phoneme sequence.
+
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence.
+        add_start_end: bool
+            Whether to wrap the sequence in the vocab's start/end symbols.
+
+        Returns
+        ----------
+        List[str]
+            The list of pronunciation sequence.
+        """
+        syllables = self.pinyin_backend.lazy_pinyin(sentence,
+                                                    style=Style.TONE3,
+                                                    strict=True)
+        return self.convert_pypinyin_tone3(syllables,
+                                           add_start_end=add_start_end)
+
+    def numericalize(self, phonemes):
+        """ Convert pronunciation sequence into pronunciation id sequence.
+
+        Parameters
+        -----------
+        phonemes: List[str]
+            The list of pronunciation sequence.
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        return [self.vocab.lookup(item) for item in phonemes]
+
+    def __call__(self, sentence, add_start_end=False):
+        """ Convert the input text sequence into pronunciation id sequence.
+
+        Takes the same arguments as `phoneticize`.
+
+        Returns
+        ----------
+        List[int]
+            The list of pronunciation id sequence.
+        """
+        phonemes = self.phoneticize(sentence, add_start_end=add_start_end)
+        return self.numericalize(phonemes)
+
+    @property
+    def vocab_size(self):
+        """ Vocab size; expected 230 = 222 phones + 4 punctuations + 4 special tokens."""
+        return len(self.vocab)
+
+
+def _convert_to_parakeet_convension(syllable):
+    """Rewrite one pypinyin Style.TONE3 syllable in the parakeet convention."""
+    tone = syllable[-1]  # the last character is carried over as the tone
+    syllable = syllable[:-1]
+
+    # a trailing "o" right after b, p, m or f is expanded to "uo"
+    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
+
+    # iong -> veng, ong -> ueng ("iong" goes first because it contains "ong")
+    syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
+
+    # ing -> ieng, in -> ien ("ing" goes first because it contains "in")
+    syllable = syllable.replace("ing", "ieng").replace("in", "ien")
+
+    # un -> uen, ui -> uei, iu -> iou
+    syllable = syllable.replace("un","uen")\
+        .replace("ui", "uei")\
+        .replace("iu", "iou")
+
+    # i after z, c, s is written "ii"; i after zh, ch, sh, r is written "iii"
+    syllable = syllable.replace("zi", "zii")\
+        .replace("ci", "cii")\
+        .replace("si", "sii")\
+        .replace("zhi", "zhiii")\
+        .replace("chi", "chiii")\
+        .replace("shi", "shiii")\
+        .replace("ri", "riii")
+
+    # y: yi -> i, yu -> v, any remaining y -> i
+    syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
+
+    # w: wu -> u, any remaining w -> u
+    syllable = syllable.replace("wu", "u").replace("w", "u")
+
+    # u right after j, q, x is written "v"
+    syllable = syllable.replace("ju", "jv")\
+        .replace("qu", "qv")\
+        .replace("xu", "xv")
+
+    return syllable + tone
+
+
+def _split_syllable(syllable: str):
+    """Split one TONE3 syllable into parallel lists of phones and tones.
+
+    A punctuation mark is returned unchanged with tone '0'. Any other
+    syllable is converted to the parakeet convention and split into an
+    initial (tone '0'), when it starts with one, and a final that takes
+    the syllable's last character as its tone.
+
+    Returns
+    ----------
+    Tuple[List[str], List[str]]
+        The phones and their tones; both empty for a token shorter than 2.
+    """
+    if syllable in _punctuations:
+        # syllables, tones
+        return [syllable], ['0']
+
+    if len(syllable) < 2:
+        # Nothing is left once the tone character is stripped; the initial
+        # lookup used to raise IndexError on that empty remainder.
+        return [], []
+
+    syllable = _convert_to_parakeet_convension(syllable)
+    tone, syllable = syllable[-1], syllable[:-1]
+
+    # [:2] is tested first so that zh/ch/sh win over z/c/s.
+    for size in (2, 1):
+        if syllable[:size] in _initials:
+            return [syllable[:size], syllable[size:]], ['0', tone]
+    return [syllable], [tone]
+
+
+def _convert_to_parakeet_style_pinyin(syllables):
+    """Split every syllable and concatenate the phones and the tones."""
+    phones, tones = [], []
+    for phone_part, tone_part in map(_split_syllable, syllables):
+        phones += phone_part
+        tones += tone_part
+    return phones, tones
+
+def _split_syllable_with_tone(syllable: str):
+    """Split one TONE3 syllable into phones, keeping the tone on the final.
+
+    A punctuation mark is returned unchanged as a one-item list.
+    Any other syllable is converted to the parakeet convention and
+    split into an initial, when it starts with one, and the remainder,
+    which still ends with the tone character. Without an initial the
+    whole converted syllable is returned as a single phone.
+    """
+    if syllable in _punctuations:
+        # syllables
+        return [syllable]
+
+    converted = _convert_to_parakeet_convension(syllable)
+
+    # [:2] is tested first so that zh/ch/sh win over z/c/s.
+    if converted[:2] in _initials:
+        return [converted[:2], converted[2:]]
+    if converted[0] in _initials:
+        return [converted[0], converted[1:]]
+    return [converted]
+
+def _convert_to_parakeet_style_pinyin_with_tone(syllables):
+    """Split every syllable and concatenate the resulting toned phones."""
+    return [
+        phone for syllable in syllables
+        for phone in _split_syllable_with_tone(syllable)
+    ]
+