diff --git a/examples/fastspeech2/baker/frontend.py b/examples/fastspeech2/baker/frontend.py index c8b848b..3ed7efb 100644 --- a/examples/fastspeech2/baker/frontend.py +++ b/examples/fastspeech2/baker/frontend.py @@ -48,9 +48,7 @@ class Frontend(): tone_ids = [self.vocab_tones[item] for item in tones] return np.array(tone_ids, np.int64) - def get_input_ids(self, sentence, get_tone_ids=False): - phonemes = self.frontend.get_phonemes(sentence) - result = {} + def _get_phone_tone(self, phonemes, get_tone_ids=False): phones = [] tones = [] if get_tone_ids and self.vocab_tones: @@ -76,12 +74,7 @@ class Frontend(): else: phones.append(full_phone) tones.append('0') - tone_ids = self._t2id(tones) - tone_ids = paddle.to_tensor(tone_ids) - result["tone_ids"] = tone_ids else: - - phones = [] for phone in phonemes: # if the merged erhua not in the vocab # assume that the input is ['iaor3'] and 'iaor' not in self.vocab_phones, change ['iaor3'] to ['iao3','er2'] @@ -92,8 +85,30 @@ class Frontend(): phones.append("er2") else: phones.append(phone) + return phones, tones - phone_ids = self._p2id(phones) - phone_ids = paddle.to_tensor(phone_ids) - result["phone_ids"] = phone_ids + def get_input_ids(self, sentence, merge_sentences=True, + get_tone_ids=False): + phonemes = self.frontend.get_phonemes( + sentence, merge_sentences=merge_sentences) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if tones: + tone_ids = self._t2id(tones) + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids return result diff --git a/examples/fastspeech2/baker/synthesize_e2e.py b/examples/fastspeech2/baker/synthesize_e2e.py index 8d57e2a..8873ab7 100644 --- a/examples/fastspeech2/baker/synthesize_e2e.py +++ b/examples/fastspeech2/baker/synthesize_e2e.py @@ -72,19 +72,25 @@ def evaluate(args, fastspeech2_config, pwg_config): std = paddle.to_tensor(std) pwg_normalizer = ZScore(mu, std) - fastspeech2_inferencce = FastSpeech2Inference(fastspeech2_normalizer, - model) + fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model) pwg_inference = PWGInference(pwg_normalizer, vocoder) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) for utt_id, sentence in sentences: - input_ids = frontend.get_input_ids(sentence) + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) phone_ids = input_ids["phone_ids"] - with paddle.no_grad(): - mel = fastspeech2_inferencce(phone_ids) - wav = pwg_inference(mel) + flags = 0 + for part_phone_ids in phone_ids: + with paddle.no_grad(): + mel = fastspeech2_inference(part_phone_ids) + temp_wav = pwg_inference(mel) + if flags == 0: + wav = temp_wav + flags = 1 + else: + wav = paddle.concat([wav, temp_wav]) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), diff --git a/parakeet/frontend/cn_frontend.py b/parakeet/frontend/cn_frontend.py index 12b2b84..62172f0 100644 --- a/parakeet/frontend/cn_frontend.py +++ b/parakeet/frontend/cn_frontend.py @@ -116,7 +116,9 @@ class Frontend(): phones.append('sp') phones_list.append(phones) if merge_sentences: - phones_list = sum(phones_list, []) + merge_list = sum(phones_list, []) + phones_list = [] + phones_list.append(merge_list) return phones_list def _merge_erhua(self, initials, finals, word, pos): @@ -136,7 +138,8 @@ class Frontend(): new_initials.append(initials[i]) return new_initials, new_finals - def get_phonemes(self, sentence, with_erhua=True): + def get_phonemes(self, sentence, merge_sentences=True, with_erhua=True): sentences = self.text_normalizer.normalize(sentence) - phonemes = self._g2p(sentences, with_erhua=with_erhua) + phonemes = self._g2p( + sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) return phonemes