2020-07-10 20:22:43 +08:00
import numpy as np
from matplotlib import cm
import librosa
import os
import time
import tqdm
import argparse
from ruamel import yaml
import paddle
from paddle import fluid
from paddle.fluid import layers as F
from paddle.fluid import dygraph as dg
from paddle.fluid.io import DataLoader
import soundfile as sf
from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
from parakeet.g2p import en
2020-07-17 17:56:23 +08:00
from parakeet.models.deepvoice3.weight_norm_hook import remove_weight_norm
2020-07-13 15:19:52 +08:00
from vocoder import WaveflowVocoder, GriffinLimVocoder
2020-07-10 20:22:43 +08:00
from train import create_model
def main(args, config):
# Builds the model from `config`, loads its parameters from `args.checkpoint`,
# selects a vocoder according to `args.vocoder`, then synthesizes every line of
# the text file `args.input` and writes each result to
# "sentence{i}.wav" inside `args.output`.
#
# NOTE(review): this copy of the function is damaged -- indentation has been
# lost, bare VCS timestamp lines are interleaved with the code, and several
# statements are missing (each gap is flagged below). Restore the function from
# version control before relying on it.
model = create_model(config)
# `loaded_step` is not used anywhere else in the visible part of this function.
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
2020-07-17 17:56:23 +08:00
for name, layer in model.named_sublayers():
# NOTE(review): the `try:` statement and its body are missing here; presumably
# it called the imported `remove_weight_norm` on `layer` -- confirm.
except ValueError:
# this layer has no weight norm hook
# NOTE(review): the body of this `except` clause is missing as well.
2020-07-10 20:22:43 +08:00
2020-07-13 15:19:52 +08:00
# Choose the vocoder implementation by name; any other name is an error.
if args.vocoder == "waveflow":
vocoder = WaveflowVocoder()
elif args.vocoder == "griffin-lim":
vocoder = GriffinLimVocoder(
# NOTE(review): the constructor arguments and the closing parenthesis are
# missing here, and so is the `else:` that should guard the `raise` below.
raise ValueError("Other vocoders are not supported.")
2020-07-10 20:22:43 +08:00
if not os.path.exists(args.output):
# NOTE(review): the body of this `if` is missing; presumably it creates the
# output directory -- confirm.
# `args.monotonic_layers` is a comma-separated string of indices; each one is
# decremented by 1 to obtain a zero-based index.
monotonic_layers = [int(item.strip()) - 1 for item in args.monotonic_layers.split(',')]
# One sentence per input line, with surrounding whitespace stripped.
with open(args.input, 'rt') as f:
sentences = [line.strip() for line in f.readlines()]
for i, sentence in enumerate(sentences):
2020-07-13 15:19:52 +08:00
wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
2020-07-10 20:22:43 +08:00
# The output file name carries the zero-based index of the sentence.
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
wav, samplerate=config["sample_rate"])
2020-07-13 15:19:52 +08:00
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    """Synthesize a waveform for a single sentence.

    Args:
        args: Parsed command-line namespace; only ``args.vocoder`` is read.
        config: Mapping that provides ``decoder_layers``, ``backward_step``
            and ``forward_step``.
        model: Callable model; it is invoked with the text batch, the text
            lengths and keyword arguments, and must return a 3-tuple whose
            second item is the refined output passed to the vocoder.
        vocoder: Callable that turns the refined output into audio.
        sentence: Text to synthesize.
        monotonic_layers: Iterable of zero-based decoder layer indices for
            which monotonic attention is forced.

    Returns:
        The vocoder output for the single item in the batch.

    Raises:
        IndexError: If an index in ``monotonic_layers`` is outside the range
            of ``config["decoder_layers"]``.
    """
    print("[synthesize] {}".format(sentence))

    # Convert the sentence to an id sequence and add a leading batch axis,
    # giving a batch that holds exactly one sequence.
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)
    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)

    # One boolean per decoder layer: True where monotonic attention is forced.
    decoder_layers = config["decoder_layers"]
    force_monotonic_attention = [False] * decoder_layers
    for i in monotonic_layers:
        force_monotonic_attention[i] = True

    with dg.no_grad():
        # The per-layer mask built above used to be discarded, which left the
        # --monotonic_layers option without any effect.
        # NOTE(review): assumes the model accepts a
        # `force_monotonic_attention` keyword -- confirm against the model
        # returned by `create_model`.
        outputs = model(
            text_seqs,
            text_lengths,
            speakers=None,
            force_monotonic_attention=force_monotonic_attention,
            window=(config["backward_step"], config["forward_step"]))
        _decoded, refined, _attentions = outputs

        if args.vocoder == "griffin-lim":
            # This vocoder is fed a transposed numpy array of the first
            # (only) batch item and its result is returned as is.
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            # Without this `else` the result above was always overwritten by
            # the tensor-based vocoder path.
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = wav.numpy()[0]
    return wav_np
2020-07-13 15:19:52 +08:00
2020-07-10 20:22:43 +08:00
def build_arg_parser():
    """Create the command-line parser for the synthesis script.

    Returns:
        argparse.ArgumentParser: Parser with the required ``--config``,
        ``--input``, ``--output``, ``--checkpoint`` and ``--monotonic_layers``
        options and the optional ``--vocoder`` option (default
        ``"waveflow"``).
    """
    # The text is passed as `description`; given positionally it would be
    # taken as the program name shown in the usage line.
    parser = argparse.ArgumentParser(description="synthesize from a checkpoint")
    parser.add_argument("--config", type=str, required=True, help="config file")
    parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
    parser.add_argument("--output", type=str, required=True, help="path to save audio")
    parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
    parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layers' indices(start from 1)")
    parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
    return parser


if __name__ == "__main__":
    # `argparse` and `yaml` are already imported at the top of the file.
    args = build_arg_parser().parse_args()
    with open(args.config, 'rt') as f:
        config = yaml.safe_load(f)
    main(args, config)