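# Synthesize speech from a text file with a trained DeepVoice3 checkpoint and a
# vocoder (WaveFlow or Griffin-Lim); one wav file is written per input line.
# Example invocation (file names and values are illustrative):
#   python <this_script>.py --config=config.yaml --input=sentences.txt --output=outputs \
#       --checkpoint=step-100000 --monotonic_layers=5,6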
import numpy as np
from matplotlib import cm
import librosa
import os
import time
import tqdm
import argparse
from ruamel import yaml
import paddle
from paddle import fluid
from paddle.fluid import layers as F
from paddle.fluid import dygraph as dg
from paddle.fluid.io import DataLoader
import soundfile as sf

from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
from parakeet.g2p import en
from parakeet.models.deepvoice3.weight_norm_hook import remove_weight_norm
from vocoder import WaveflowVocoder, GriffinLimVocoder
from train import create_model


def main(args, config):
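    """Build the model and vocoder, load the checkpoint, and synthesize every sentence in the input file."""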
    model = create_model(config)
    loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
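    # Remove weight normalization hooks so inference runs on the plain, merged weights;
    # layers without the hook raise ValueError and are simply skipped.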
    for name, layer in model.named_sublayers():
        try:
            remove_weight_norm(layer)
        except ValueError:
            # this layer has no weight norm hook
            pass
    model.eval()
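    # Build the vocoder that converts the model's output spectrogram into a waveform.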
    if args.vocoder == "waveflow":
        vocoder = WaveflowVocoder()
        vocoder.model.eval()
    elif args.vocoder == "griffin-lim":
        vocoder = GriffinLimVocoder(
            sharpening_factor=config["sharpening_factor"],
            sample_rate=config["sample_rate"],
            n_fft=config["n_fft"],
            win_length=config["win_length"],
            hop_length=config["hop_length"])
    else:
        raise ValueError("Other vocoders are not supported.")

    if not os.path.exists(args.output):
        os.makedirs(args.output)
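    # --monotonic_layers passes 1-based decoder layer indices; convert them to 0-based.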
    monotonic_layers = [int(item.strip()) - 1 for item in args.monotonic_layers.split(',')]
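    # Synthesize each line of the input file and save it as sentence{i}.wav in the output directory.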
    with open(args.input, 'rt') as f:
        sentences = [line.strip() for line in f.readlines()]
    for i, sentence in enumerate(sentences):
        wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
        sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
                 wav, samplerate=config["sample_rate"])


def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
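    """Synthesize one sentence and return the waveform as a numpy array."""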
    print("[synthesize] {}".format(sentence))
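    # Convert the sentence to an id sequence and wrap it (with its length) as batch-1 tensors.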
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)
    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)
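
    # Force monotonic attention only on the decoder layers requested via --monotonic_layers.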
    decoder_layers = config["decoder_layers"]
    force_monotonic_attention = [False] * decoder_layers
    for i in monotonic_layers:
        force_monotonic_attention[i] = True
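
    # Run the model without gradients and convert the refined spectrogram output to audio.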
    with dg.no_grad():
        outputs = model(text_seqs, text_lengths, speakers=None,
                        force_monotonic_attention=force_monotonic_attention,
                        window=(config["backward_step"], config["forward_step"]))
        decoded, refined, attentions = outputs
        if args.vocoder == "griffin-lim":
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = wav.numpy()[0]
    return wav_np


if __name__ == "__main__":
    parser = argparse.ArgumentParser("synthesize from a checkpoint")
    parser.add_argument("--config", type=str, required=True, help="config file")
    parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
    parser.add_argument("--output", type=str, required=True, help="directory to save the synthesized audio")
    parser.add_argument("--checkpoint", type=str, required=True, help="path of the checkpoint to load")
    parser.add_argument("--monotonic_layers", type=str, required=True, help="comma-separated indices of decoder layers to force monotonic attention on (1-based)")
    parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
    args = parser.parse_args()
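    # Load the YAML config that provides the audio and model hyperparameters used above.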
    with open(args.config, 'rt') as f:
        config = yaml.safe_load(f)
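
    # Run in dygraph (eager) mode on GPU 0.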
    dg.enable_dygraph(fluid.CUDAPlace(0))
    main(args, config)