add e2e inference script

2021-07-13 10:51:17 +08:00 · 2021-07-13 10:51:17 +08:00 · a62eeb9b06
parent acc02c9b79
commit a62eeb9b06
4 changed files with 259 additions and 1 deletions
--- a/examples/speedyspeech/baker/frontend.py
+++ b/examples/speedyspeech/baker/frontend.py
@ -0,0 +1,92 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
 import numpy as np
 import paddle
 import pypinyin
 from pypinyin import lazy_pinyin, Style
 import jieba
 import phkit
 phkit.initialize()
 from parakeet.frontend.vocab import Vocab
 # Load the phone and tone inventories, one symbol per line, from files in the
 # current working directory; surrounding whitespace is stripped from each line.
 with open("phones.txt", 'rt') as f:
    phones = [entry.strip() for entry in f]
 with open("tones.txt", 'rt') as f:
    tones = [entry.strip() for entry in f]
 # Vocabularies used to map symbols to ids; no start/end symbols are added.
 voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
 voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
 def segment(sentence):
    """Split a sentence into chunks at full-width punctuation.

    The sentence is cut at every occurrence of one of the characters
    ``：，；。？！``; the punctuation itself is discarded and empty chunks
    (e.g. produced by trailing or consecutive punctuation) are dropped.

    Args:
        sentence: the text to split.

    Returns:
        A list of the non-empty chunks, in their original order.
    """
    pieces = re.split(r'[：，；。？！]', sentence)
    # filter(None, ...) keeps only the non-empty strings.
    return list(filter(None, pieces))
 def g2p(sentence):
    """Convert a sentence into parallel phone and tone sequences.

    The sentence is split with ``segment``; each chunk is word-segmented with
    jieba and converted by pypinyin into initials and tone-numbered finals.
    Every initial is emitted with tone ``'0'``; every final is emitted without
    its last character, which is used as the tone. An ``'sp'`` symbol (tone
    ``'0'``) follows each chunk, and the sequence starts and ends with
    ``'sil'`` (tone ``'0'``).

    Args:
        sentence: the text to convert.

    Returns:
        A tuple ``(phones, tones)`` of two lists of strings with equal length.
    """
    chunks = segment(sentence)
    phone_seq = ['sil']
    tone_seq = ['0']
    for chunk in chunks:
        words = jieba.lcut(chunk)
        initials = lazy_pinyin(
            words, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            words, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for initial, final in zip(initials, finals):
            # NOTE: post process for pypinyin outputs.
            # A final that is a bare "i" plus a digit is relabelled depending
            # on the initial, so that i, ii and iii are told apart.
            if re.match(r'i\d', final):
                if initial in ('z', 'c', 's'):
                    final = final.replace('i', 'ii')
                elif initial in ('zh', 'ch', 'sh', 'r'):
                    final = final.replace('i', 'iii')
            if initial:
                phone_seq.append(initial)
                tone_seq.append('0')
            if final:
                # The last character of the final is taken as its tone.
                phone_seq.append(final[:-1])
                tone_seq.append(final[-1])
        # Separator after each chunk.
        phone_seq.append('sp')
        tone_seq.append('0')
    # The last symbol (the trailing 'sp', or the leading 'sil' when there were
    # no chunks) is overwritten with 'sil'.
    phone_seq[-1] = 'sil'
    tone_seq[-1] = '0'
    return (phone_seq, tone_seq)
 def p2id(voc, phonemes):
    """Map phoneme symbols to ids.

    Args:
        voc: an object whose ``lookup(symbol)`` method returns the id of a
            symbol.
        phonemes: an iterable of phoneme symbols.

    Returns:
        A numpy array of dtype ``int64`` holding one id per phoneme.
    """
    ids = list(map(voc.lookup, phonemes))
    return np.array(ids, np.int64)
 def t2id(voc, tones):
    """Map tone symbols to ids.

    Args:
        voc: an object whose ``lookup(symbol)`` method returns the id of a
            symbol.
        tones: an iterable of tone symbols.

    Returns:
        A numpy array of dtype ``int64`` holding one id per tone.
    """
    ids = list(map(voc.lookup, tones))
    return np.array(ids, np.int64)
 def text_analysis(sentence):
    """Turn a sentence into phone-id and tone-id tensors.

    The sentence is converted with ``g2p``; the sentence and a readable
    phone+tone listing are printed; then the symbols are mapped to ids with
    the module-level vocabularies and wrapped in paddle tensors.

    Args:
        sentence: the text to analyse.

    Returns:
        A tuple ``(phones, tones)`` of paddle tensors built from the int64 id
        arrays.
    """
    phonemes, tone_marks = g2p(sentence)
    print(sentence)
    # Readable form: the tone is appended to the phone unless it is '0'.
    readable = []
    for phone, tone in zip(phonemes, tone_marks):
        readable.append(phone if tone == '0' else phone + tone)
    print(readable)
    phone_ids = p2id(voc_phones, phonemes)
    tone_ids = t2id(voc_tones, tone_marks)
    return paddle.to_tensor(phone_ids), paddle.to_tensor(tone_ids)
--- a/examples/speedyspeech/baker/sentences.txt
+++ b/examples/speedyspeech/baker/sentences.txt
@ -0,0 +1,16 @@
 001 凯莫瑞安联合体的经济崩溃，迫在眉睫。
 002 对于所有想要离开那片废土，去寻找更美好生活的人来说。
 003 克哈，是你们所有人安全的港湾。
 004 为了保护尤摩扬人民不受异虫的残害，我所做的，比他们自己的领导委员会都多。
 005 无论他们如何诽谤我，我将继续为所有泰伦人的最大利益，而努力奋斗。
 006 身为你们的元首，我带领泰伦人实现了人类统治领地和经济的扩张。
 007 我们将继续成长，用行动回击那些只会说风凉话，不愿意和我们相向而行的害群之马。
 008 帝国武装力量，无数的优秀儿女，正时刻守卫着我们的家园大门，但是他们孤木难支。
 009 凡是今天应征入伍者，所获的所有刑罚罪责，减半。
 010 激进分子和异见者希望你们一听见枪声，就背弃多年的和平与繁荣。
 011 他们没有勇气和能力，带领人类穿越一个充满危险的星系。
 012 法治是我们的命脉，然而它却受到前所未有的挑战。
 013 我将恢复我们帝国的荣光，绝不会向任何外星势力低头。
 014 我已经驯服了异虫，荡平了星灵。如今它们的创造者，想要夺走我们拥有的一切。
 015 永远记住，谁才是最能保护你们的人。
 016 不要听信别人的谗言，我不是什么克隆人。
--- a/examples/speedyspeech/baker/synthesize.py
+++ b/examples/speedyspeech/baker/synthesize.py
@ -121,7 +121,7 @@ def main():
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
-    parser.add_argument("--test-metadata", type=str, help="training data")
+    parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@ -0,0 +1,150 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import sys
 import logging
 import argparse
 import dataclasses
 from pathlib import Path
 import yaml
 import jsonlines
 import paddle
 import numpy as np
 import soundfile as sf
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddle import distributed as dist
 from yacs.config import CfgNode
 from parakeet.datasets.data_table import DataTable
 from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
 from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
 from parakeet.modules.normalizer import ZScore
 from frontend import text_analysis
 def _read_sentences(text_path):
    """Read ``utt_id sentence`` pairs from a text file.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. Lines that are empty or contain only whitespace are skipped.

    Args:
        text_path: path of the text file, one whitespace-separated
            ``utt_id sentence`` pair per line.

    Returns:
        A list of ``(utt_id, sentence)`` tuples in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    sentences = []
    with open(text_path, 'rt', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing one at end of file.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{text_path}:{line_no}: expected 'utt_id sentence', "
                    f"got {line.rstrip()!r}")
            sentences.append(tuple(fields))
    return sentences


 def _load_normalizer(stat_path):
    """Build a ``ZScore`` normalizer from a statistics file.

    The array loaded with ``np.load`` is unpacked along its first axis into
    ``(mu, std)``.
    """
    mu, std = np.load(stat_path)
    return ZScore(paddle.to_tensor(mu), paddle.to_tensor(std))


 def evaluate(args, speedyspeech_config, pwg_config):
    """Synthesize a wav file for every sentence listed in ``args.text``.

    Args:
        args: parsed command line arguments; this function reads ``text``,
            ``speedyspeech_checkpoint``, ``speedyspeech_stat``,
            ``pwg_params``, ``pwg_stat`` and ``output_dir``.
        speedyspeech_config: config whose ``"model"`` entry holds the keyword
            arguments of ``SpeedySpeech`` and whose ``sr`` attribute is used
            as the sample rate of the written audio.
        pwg_config: config whose ``"generator_params"`` entry holds the
            keyword arguments of ``PWGGenerator``.

    Raises:
        ValueError: if a non-blank line of the text file is not a
            ``utt_id sentence`` pair.
    """
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = _read_sentences(args.text)

    # acoustic model
    model = SpeedySpeech(**speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    # vocoder
    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    # Each model is wrapped together with a normalizer built from its own
    # statistics file.
    speedyspeech_normalizer = _load_normalizer(args.speedyspeech_stat)
    pwg_normalizer = _load_normalizer(args.pwg_stat)
    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        phones, tones = text_analysis(sentence)
        with paddle.no_grad():
            # The output of the acoustic model is fed to the vocoder.
            wav = pwg_inference(speedyspeech_inference(phones, tones))
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.sr)
        print(f"{utt_id} done!")
 def main():
    """Parse command line arguments, load both configs and run ``evaluate``.

    The speedyspeech and parallel wavegan config files are read as YAML and
    wrapped in ``CfgNode``; the parsed arguments and both configs are printed
    before synthesis starts.
    """
    # parse args and configs, then hand over to evaluate()
    # NOTE(review): --device and --verbose are parsed but are not used by the
    # code visible in this file - confirm whether the device should be set.
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config",
        type=str,
        help="speedyspeech config file.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config",
        type=str,
        help="parallel wavegan config file.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args = parser.parse_args()
    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)
 # Run the command line entry point only when executed as a script,
 # not when the module is imported.
 if __name__ == "__main__":
    main()