# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
from os.path import join, expanduser
from warnings import warn
from datetime import datetime

import matplotlib
# Force matplotlib not to use any Xwindows backend.
matplotlib.use("Agg")
from matplotlib import pyplot as plt
from matplotlib import cm

import audio

import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
import librosa.display
from tensorboardX import SummaryWriter

# import global hyper parameters
from hparams import hparams
from parakeet import g2p as frontend

_frontend = getattr(frontend, hparams.frontend)


def tts(model, text, p=0., speaker_id=None):
    """
    Convert text to a speech waveform given a deepvoice3 model.

    Args:
        model (DeepVoiceTTS): Model used to synthesize the waveform.
        text (str): Input text to be synthesized.
        p (float): Probability of replacing a word with its pronunciation.
            Default is 0.

    Returns:
        waveform (numpy.ndarray): Shape(T_wav, ), predicted waveform, where
            T_wav means the length of the synthesized waveform.
        alignment (numpy.ndarray): Shape(T_dec, T_enc), predicted alignment
            matrix, where T_dec means the time steps of decoder outputs and
            T_enc means the time steps of encoder outputs.
        spectrogram (numpy.ndarray): Shape(T_lin, C_lin), predicted linear
            spectrogram, where T_lin means the time steps of the linear
            spectrogram and C_lin means the channels of the linear
            spectrogram.
        mel (numpy.ndarray): Shape(T_mel, C_mel), predicted mel spectrogram,
            where T_mel means the time steps of the mel spectrogram and
            C_mel means the channels of the mel spectrogram.
    """
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
    sequence = np.reshape(sequence, (1, -1, 1))
    text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
    text_positions = np.reshape(text_positions, (1, -1, 1))

    sequence = dg.to_variable(sequence)
    text_positions = dg.to_variable(text_positions)
    # fill_constant requires an explicit dtype; speaker ids are int64 indices
    speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
        shape=[1, 1], dtype="int64", value=speaker_id)

    # sequence: shape(1, input_length, 1)
    # text_positions: shape(1, input_length, 1)
    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        sequence, text_positions, speaker_ids)

    # reshape to the desired shape
    linear_output = linear_outputs.numpy().squeeze().T
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments.numpy()[0]
    mel = mel_outputs.numpy().squeeze().T
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
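

# Example (sketch, not part of the library): how `tts` above might be invoked
# once a DeepVoiceTTS model has been built and restored elsewhere. The model
# construction and checkpoint loading are intentionally elided; the dygraph
# guard mirrors the `dg.to_variable` usage inside `tts`.
#
#     with dg.guard():
#         dv3 = ...  # a restored DeepVoiceTTS instance (assumed)
#         wav, attn, lin, mel = tts(dv3, "Hello, world.", p=0., speaker_id=None)
#         wav /= np.max(np.abs(wav))
#         audio.save_wav(wav, "hello.wav")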


def prepare_spec_image(spectrogram):
    """
    Prepare an image from a spectrogram to be written to the tensorboardX
    summary writer.

    Args:
        spectrogram (numpy.ndarray): Shape(T, C), spectrogram to be
            visualized, where T means the time steps of the spectrogram
            and C means the channels of the spectrogram.

    Returns:
        np.ndarray: Shape(C, T, 4), the generated image of the spectrogram,
            where T means the time steps of the spectrogram and is treated
            as the width of the image, C means the channels of the
            spectrogram and is treated as the height of the image, and 4
            means the image is in 'RGBA' format.
    """
    # scale to [0, 1]
    spectrogram = (spectrogram - np.min(spectrogram)) / (
        np.max(spectrogram) - np.min(spectrogram))
    spectrogram = np.flip(spectrogram, axis=1)  # flip against freq axis
    return np.uint8(cm.magma(spectrogram.T) * 255)


def plot_alignment(alignment, path, info=None):
    fig, ax = plt.subplots()
    im = ax.imshow(
        alignment, aspect="auto", origin="lower", interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"
    if info is not None:
        xlabel += "\n\n" + info
    plt.xlabel(xlabel)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()


def time_string():
    return datetime.now().strftime("%Y-%m-%d %H:%M")


def save_alignment(global_step, path, attn):
    plot_alignment(
        attn.T,
        path,
        info="{}, {}, step={}".format(hparams.builder,
                                      time_string(), global_step))


def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
    # hard coded text sequences
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]

    eval_output_dir = join(checkpoint_dir, "eval")
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    print("[eval] Evaluating the model, results are saved in {}".format(
        eval_output_dir))

    model.eval()

    # hard coded
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = ("multispeaker{}".format(speaker_id)
                       if speaker_id is not None else "single")

        for idx, text in enumerate(texts):
            signal, alignment, _, mel = tts(model,
                                            text,
                                            p=0,
                                            speaker_id=speaker_id)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(eval_output_dir,
                        "step{:09d}_text{}_{}_alignment.png".format(
                            global_step, idx, speaker_str))
            save_alignment(global_step, path, alignment)
            tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)
            writer.add_image(
                tag,
                np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step,
                dataformats='HWC')

            # Mel
            writer.add_image(
                "(Eval) Predicted mel spectrogram text{}_{}".format(
                    idx, speaker_str),
                prepare_spec_image(mel),
                global_step,
                dataformats='HWC')

            # Audio
            path = join(eval_output_dir,
                        "step{:09d}_text{}_{}_predicted.wav".format(
                            global_step, idx, speaker_str))
            audio.save_wav(signal, path)

            try:
                writer.add_audio(
                    "(Eval) Predicted audio signal {}_{}".format(
                        idx, speaker_str),
                    signal,
                    global_step,
                    sample_rate=hparams.sample_rate)
            except Exception as e:
                warn(str(e))
                pass
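

# Example (sketch): `eval_model` is typically called periodically from the
# training loop, with the same tensorboardX SummaryWriter used for the other
# summaries. The interval name below is illustrative, not a fixed hparam.
#
#     writer = SummaryWriter(join(checkpoint_dir, "log"))
#     if global_step % eval_interval == 0:
#         eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker)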
""" print("[train] Saving intermediate states at step {}".format(global_step)) idx = min(1, len(input_lengths) - 1) input_length = input_lengths[idx] # Alignment, Multi-hop attention if attn is not None and len(attn.shape) == 4: attn = attn.numpy() for i in range(attn.shape[0]): alignment = attn[i] alignment = alignment[idx] tag = "alignment_layer{}".format(i + 1) writer.add_image( tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step, dataformats='HWC') alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1)) if not os.path.exists(alignment_dir): os.makedirs(alignment_dir) path = join( alignment_dir, "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1)) save_alignment(global_step, path, alignment) alignment_dir = join(checkpoint_dir, "alignment_ave") if not os.path.exists(alignment_dir): os.makedirs(alignment_dir) path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step)) alignment = np.mean(attn, axis=0)[idx] save_alignment(global_step, path, alignment) tag = "averaged_alignment" writer.add_image( tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step, dataformats="HWC") if mel_outputs is not None: mel_output = mel_outputs[idx].numpy().squeeze().T mel_output = prepare_spec_image(audio._denormalize(mel_output)) writer.add_image( "Predicted mel spectrogram", mel_output, global_step, dataformats="HWC") if linear_outputs is not None: linear_output = linear_outputs[idx].numpy().squeeze().T spectrogram = prepare_spec_image(audio._denormalize(linear_output)) writer.add_image( "Predicted linear spectrogram", spectrogram, global_step, dataformats="HWC") signal = audio.inv_spectrogram(linear_output.T) signal /= np.max(np.abs(signal)) path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step)) try: writer.add_audio( "Predicted audio signal", signal, global_step, sample_rate=hparams.sample_rate) except Exception as e: warn(str(e)) pass audio.save_wav(signal, path) if mel_outputs is not None: mel_output = mel[idx].numpy().squeeze().T mel_output = prepare_spec_image(audio._denormalize(mel_output)) writer.add_image( "Target mel spectrogram", mel_output, global_step, dataformats="HWC") if linear_outputs is not None: linear_output = y[idx].numpy().squeeze().T spectrogram = prepare_spec_image(audio._denormalize(linear_output)) writer.add_image( "Target linear spectrogram", spectrogram, global_step, dataformats="HWC")