ParakeetEricRoss/parakeet/models/wavenet/wavenet_modules.py

import itertools
import math

import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
import ops
import weight_norm


def get_padding(filter_size, stride, padding_type='same'):
    if padding_type == 'same':
        padding = [(x - y) // 2 for x, y in zip(filter_size, stride)]
    else:
        raise ValueError("Only support same padding")
    return padding


def debug(x, var_name, rank, verbose=False):
    if not verbose and rank != 0:
        return
    dim = len(x.shape)
    if not isinstance(x, np.ndarray):
        x = x.numpy()
    if dim == 1:
        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x))
    elif dim == 2:
        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5]))
    elif dim == 3:
        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5, 0]))
    else:
        print("Rank", rank, var_name, "shape", x.shape)


def extract_slices(x, audio_starts, audio_length, rank):
    slices = []
    for i in range(x.shape[0]): 
        start = audio_starts.numpy()[i]
        end = start + audio_length
        slice = fluid.layers.slice(
            x, axes=[0, 1], starts=[i, start], ends=[i+1, end])
        slices.append(fluid.layers.squeeze(slice, [0]))

    x = fluid.layers.stack(slices, axis=0)

    return x


class Conditioner(dg.Layer):
    def __init__(self, name_scope, config):
        super(Conditioner, self).__init__(name_scope)
        upsample_factors = config.conditioner.upsample_factors
        filter_sizes = config.conditioner.filter_sizes
        assert np.prod(upsample_factors) == config.fft_window_shift

        self.deconvs = []
        for i, up_scale in enumerate(upsample_factors):
            stride = (up_scale, 1)
            padding = get_padding(filter_sizes[i], stride)
            self.deconvs.append(
                ops.Conv2DTranspose(
                    self.full_name(),
                    num_filters=1,
                    filter_size=filter_sizes[i],
                    padding=padding,
                    stride=stride))

        # Register python list as parameters.
        for i, layer in enumerate(self.deconvs):
            self.add_sublayer("conv_transpose_{}".format(i), layer)
        
    def forward(self, x):
        x = fluid.layers.unsqueeze(x, 1)
        for layer in self.deconvs:
            x = fluid.layers.leaky_relu(layer(x), alpha=0.4)

        return fluid.layers.squeeze(x, [1])


class WaveNetModule(dg.Layer):
    def __init__(self, name_scope, config, rank):
        super(WaveNetModule, self).__init__(name_scope)
        
        self.rank = rank
        self.conditioner = Conditioner(self.full_name(), config)
        self.dilations = list(
            itertools.islice(
                itertools.cycle(config.dilation_block), config.layers))
        self.context_size = sum(self.dilations) + 1
        self.log_scale_min = config.log_scale_min
        self.config = config

        print("dilations", self.dilations)
        print("context_size", self.context_size)

        if config.loss_type == "softmax":
            self.embedding_fc = ops.Embedding(
                self.full_name(),
                num_embeddings=config.num_channels,
                embed_dim=config.residual_channels)
        elif config.loss_type == "mix-gaussian-pdf":
            self.embedding_fc = ops.FC(
                self.full_name(),
                in_features=1,
                size=config.residual_channels,
                num_flatten_dims=2,
                relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(loss_type))

        self.dilated_causal_convs = []
        for dilation in self.dilations:
            self.dilated_causal_convs.append(
                ops.Conv1D_GU(
                    self.full_name(),
                    conditioner_dim=config.mel_bands,
                    in_channels=config.residual_channels,
                    num_filters=config.residual_channels,
                    filter_size=config.kernel_width,
                    dilation=dilation,
                    causal=True
                )
            )

        for i, layer in enumerate(self.dilated_causal_convs):
            self.add_sublayer("dilated_causal_conv_{}".format(i), layer) 

        self.fc1 = ops.FC(
            self.full_name(),
            in_features=config.residual_channels,
            size=config.skip_channels,
            num_flatten_dims=2,
            relu=True,
            act="relu")

        self.fc2 = ops.FC(
            self.full_name(),
            in_features=config.skip_channels,
            size=config.skip_channels,
            num_flatten_dims=2,
            relu=True,
            act="relu")

        if config.loss_type == "softmax":
            self.fc3 = ops.FC(
                self.full_name(),
                in_features=config.skip_channels,
                size=config.num_channels,
                num_flatten_dims=2,
                relu=False)
        elif config.loss_type == "mix-gaussian-pdf":
            self.fc3 = ops.FC(
                self.full_name(),
                in_features=config.skip_channels,
                size=3 * config.num_mixtures,
                num_flatten_dims=2,
                relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(loss_type))

    def sample_softmax(self, mix_parameters):
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
            [batch * length, hidden])
        mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)

        # quantized: [batch * length]
        quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),
            dtype="float32")
        samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0

        # samples: [batch * length]
        return samples

    def sample_mix_gaussian(self, mix_parameters):
        # mix_parameters reshape from [bs, 13799, 3 * num_mixtures]
        # to [bs * 13799, 3 * num_mixtures].
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
            [batch * length, hidden])
        K = hidden // 3

        # Unpack the parameters of the mixture of gaussian.
        logits_pi = mix_param_2d[:, 0 : K]
        mu = mix_param_2d[:, K : 2*K]
        log_s = mix_param_2d[:, 2*K : 3*K]
        s = fluid.layers.exp(log_s)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        comp_samples = fluid.layers.sampling_id(pi)
        
        row_idx = dg.to_variable(np.arange(batch * length))
        comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)

        mu_comp = fluid.layers.gather_nd(mu, comp_samples)
        s_comp = fluid.layers.gather_nd(s, comp_samples) 

        # N(0, 1) Normal Sample.
        u = fluid.layers.gaussian_random(shape=[batch * length])
        samples = mu_comp + u * s_comp
        samples = fluid.layers.clip(samples, min=-1.0, max=1.0)

        return samples

    def softmax_loss(self, targets, mix_parameters):
        # targets: [bs, 13799] -> [bs, 11752]
        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
        targets = targets[:, self.context_size:]
        mix_parameters = mix_parameters[:, self.context_size:, :]

        # Quantized audios to integral values with range [0, num_channels)
        num_channels = self.config.num_channels
        targets = fluid.layers.clip(targets, min=-1.0, max=0.99999)
        quantized = fluid.layers.cast(
            (targets + 1.0) / 2.0 * num_channels, dtype="int64")

        # per_sample_loss shape: [bs, 17952, 1]
        per_sample_loss = fluid.layers.softmax_with_cross_entropy(
            logits=mix_parameters, label=fluid.layers.unsqueeze(quantized, 2))
        loss = fluid.layers.reduce_mean(per_sample_loss)
        #debug(loss, "softmax loss", self.rank)

        return loss

    def mixture_density_loss(self, targets, mix_parameters, log_scale_min):
        # targets: [bs, 13799] -> [bs, 11752]
        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
        targets = targets[:, self.context_size:]
        mix_parameters = mix_parameters[:, self.context_size:, :]

        # log_s: [bs, 11752, num_mixture]
        logits_pi, mu, log_s = fluid.layers.split(mix_parameters, num_or_sections=3, dim=-1)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        log_s = fluid.layers.clip(log_s, min=log_scale_min, max=100.0)
        inv_s = fluid.layers.exp(0.0 - log_s)

        # Calculate gaussian loss.
        targets = fluid.layers.unsqueeze(targets, -1)
        targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
        x_std =  inv_s * (targets - mu)
        exponent = fluid.layers.exp(-0.5 * x_std * x_std)
        # pdf_x: [bs, 11752, 1]
        pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
        pdf_x = pi * pdf_x
        # pdf_x: [bs, 11752]
        pdf_x = fluid.layers.reduce_sum(pdf_x, dim=-1)
        per_sample_loss = 0.0 - fluid.layers.log(pdf_x + 1e-9)

        loss = fluid.layers.reduce_mean(per_sample_loss)

        return loss

    def forward(self, audios, mels, audio_starts, sample=False):
        # audios: [bs, 13800], mels: [bs, full_frame_length, 80]
        # audio_starts: [bs]
        # Build conditioner based on mels.
        full_conditioner = self.conditioner(mels)

        # Slice conditioners.
        audio_length = audios.shape[1]
        conditioner = extract_slices(full_conditioner,
            audio_starts, audio_length, self.rank)
    
        # input_audio, target_audio: [bs, 13799]
        input_audios = audios[:, :-1]
        target_audios = audios[:, 1:]
        # conditioner: [bs, 13799, 80]
        conditioner = conditioner[:, 1:, :]

        loss_type = self.config.loss_type

        # layer_input: [bs, 13799, 128]
        if loss_type == "softmax":
            input_audios = fluid.layers.clip(
                input_audios, min=-1.0, max=0.99999)
            # quantized have values in [0, num_channels)
            quantized = fluid.layers.cast(
                (input_audios + 1.0) / 2.0 * self.config.num_channels,
                dtype="int64")
            layer_input = self.embedding_fc(fluid.layers.unsqueeze(quantized, 2))
        elif loss_type == "mix-gaussian-pdf":
            layer_input = self.embedding_fc(fluid.layers.unsqueeze(input_audios, 2))
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(loss_type))

        # layer_input: [bs, res_channel, 1, 13799]
        layer_input = fluid.layers.unsqueeze(fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
        # conditioner: [bs, mel_bands, 1, 13799]
        conditioner = fluid.layers.unsqueeze(fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)

        # layer_input: [bs, res_channel, 1, 13799]
        # skip: [bs, res_channel, 1, 13799]
        skip = None
        for i, layer in enumerate(self.dilated_causal_convs):
            layer_input, skip = layer(layer_input, skip, conditioner)
            #debug(layer_input, "layer_input_" + str(i), self.rank)
            #debug(skip, "skip_" + str(i), self.rank)

        # Reshape skip to [bs, 13799, res_channel]
        skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
        #debug(skip, "skip", self.rank)

        # mix_param: [bs, 13799, 3 * num_mixtures]
        mix_parameters = self.fc3(self.fc2(self.fc1(skip)))

        # Sample teacher-forced audio.
        sample_audios = None
        if sample:
            if loss_type == "softmax":
                sample_audios = self.sample_softmax(mix_parameters)
            elif loss_type == "mix-gaussian-pdf":
                sample_audios = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError(
                    "loss_type {} is unsupported!".format(loss_type))
            #debug(sample_audios, "sample_audios", self.rank)

        # Calculate mix-gaussian density loss.
        # padding is all zero.
        # target_audio: [bs, 13799].
        # mix_params: [bs, 13799, 3].
        if loss_type == "softmax":
            loss = self.softmax_loss(target_audios, mix_parameters)
        elif loss_type == "mix-gaussian-pdf":
            loss = self.mixture_density_loss(target_audios,
                mix_parameters, self.log_scale_min)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(loss_type))

        #print("Rank {}, loss {}".format(self.rank, loss.numpy()))
 
        return loss, sample_audios

    def synthesize(self, mels):
        self.start_new_sequence()
        print("input mels shape", mels.shape)
        # mels: [bs=1, n_frames, 80]
        # conditioner: [1, n_frames * samples_per_frame, 80]
        # Should I move forward by one sample? No difference
        # Append context frame to mels
        bs, n_frames, mel_bands = mels.shape 
        #num_pad_frames = int(np.ceil(self.context_size / self.config.fft_window_shift))
        #silence = fluid.layers.zeros(shape=[bs, num_pad_frames, mel_bands], dtype="float32")
        #inf_mels = fluid.layers.concat([silence, mels], axis=1)
        #print("padded mels shape", inf_mels.shape)

        #conditioner = self.conditioner(inf_mels)[:, self.context_size:, :]
        conditioner = self.conditioner(mels)
        time_steps = conditioner.shape[1]
        print("Total steps", time_steps)

        loss_type = self.config.loss_type
        audio_samples = []
        current_sample = fluid.layers.zeros(shape=[bs, 1, 1], dtype="float32")
        for i in range(time_steps):
            if i % 100 == 0:
                print("Step", i)

            # convert from real value sample to audio embedding.
            # [bs, 1, 128]
            if loss_type == "softmax":
                current_sample = fluid.layers.clip(
                    current_sample, min=-1.0, max=0.99999)
                # quantized have values in [0, num_channels)
                quantized = fluid.layers.cast(
                    (current_sample + 1.0) / 2.0 * self.config.num_channels,
                    dtype="int64")
                audio_input = self.embedding_fc(quantized)
            elif loss_type == "mix-gaussian-pdf":
                audio_input = self.embedding_fc(current_sample)
            else:
                raise ValueError(
                    "loss_type {} is unsupported!".format(loss_type))

            # [bs, 128, 1, 1]
            audio_input = fluid.layers.unsqueeze(fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
            # [bs, 80]
            cond_input = conditioner[:, i, :]
            # [bs, 80, 1, 1]
            cond_input = fluid.layers.reshape(
                cond_input, cond_input.shape + [1, 1])

            skip = None
            for layer in self.dilated_causal_convs:
                audio_input, skip = layer.add_input(audio_input, skip, cond_input)
            
            # [bs, 1, 128]
            skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
            # [bs, 1, 3]
            mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
            if loss_type == "softmax":
                sample = self.sample_softmax(mix_parameters)
            elif loss_type == "mix-gaussian-pdf":
                sample = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError(
                    "loss_type {} is unsupported!".format(loss_type))
            audio_samples.append(sample)
            # [bs]
            current_sample = audio_samples[-1]
            # [bs, 1, 1]
            current_sample = fluid.layers.reshape(current_sample,
                current_sample.shape + [1, 1])

        # syn_audio: (num_samples,)
        syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()

        return syn_audio        

    def start_new_sequence(self):
        for layer in self.sublayers():
            if isinstance(layer, weight_norm.Conv1D):
                layer.start_new_sequence()

    def save(self, iteration):
        utils.save_latest_parameters(self.checkpoint_dir, iteration,
                                     self.wavenet, self.optimizer)
        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
working integraton with parakeet 2019-12-03 06:00:53 +08:00			`import itertools`
			`import math`

			`import numpy as np`
			`from paddle import fluid`
			`import paddle.fluid.dygraph as dg`
			`import ops`
			`import weight_norm`


			`def get_padding(filter_size, stride, padding_type='same'):`
			`if padding_type == 'same':`
			`padding = [(x - y) // 2 for x, y in zip(filter_size, stride)]`
			`else:`
			`raise ValueError("Only support same padding")`
			`return padding`


			`def debug(x, var_name, rank, verbose=False):`
			`if not verbose and rank != 0:`
			`return`
			`dim = len(x.shape)`
			`if not isinstance(x, np.ndarray):`
			`x = x.numpy()`
			`if dim == 1:`
			`print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x))`
			`elif dim == 2:`
			`print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5]))`
			`elif dim == 3:`
			`print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5, 0]))`
			`else:`
			`print("Rank", rank, var_name, "shape", x.shape)`


			`def extract_slices(x, audio_starts, audio_length, rank):`
			`slices = []`
			`for i in range(x.shape[0]):`
			`start = audio_starts.numpy()[i]`
			`end = start + audio_length`
			`slice = fluid.layers.slice(`
			`x, axes=[0, 1], starts=[i, start], ends=[i+1, end])`
			`slices.append(fluid.layers.squeeze(slice, [0]))`

			`x = fluid.layers.stack(slices, axis=0)`

			`return x`


			`class Conditioner(dg.Layer):`
			`def __init__(self, name_scope, config):`
			`super(Conditioner, self).__init__(name_scope)`
			`upsample_factors = config.conditioner.upsample_factors`
			`filter_sizes = config.conditioner.filter_sizes`
			`assert np.prod(upsample_factors) == config.fft_window_shift`

			`self.deconvs = []`
			`for i, up_scale in enumerate(upsample_factors):`
			`stride = (up_scale, 1)`
			`padding = get_padding(filter_sizes[i], stride)`
			`self.deconvs.append(`
			`ops.Conv2DTranspose(`
			`self.full_name(),`
			`num_filters=1,`
			`filter_size=filter_sizes[i],`
			`padding=padding,`
			`stride=stride))`

			`# Register python list as parameters.`
			`for i, layer in enumerate(self.deconvs):`
			`self.add_sublayer("conv_transpose_{}".format(i), layer)`

			`def forward(self, x):`
			`x = fluid.layers.unsqueeze(x, 1)`
			`for layer in self.deconvs:`
			`x = fluid.layers.leaky_relu(layer(x), alpha=0.4)`

			`return fluid.layers.squeeze(x, [1])`


			`class WaveNetModule(dg.Layer):`
			`def __init__(self, name_scope, config, rank):`
			`super(WaveNetModule, self).__init__(name_scope)`

			`self.rank = rank`
			`self.conditioner = Conditioner(self.full_name(), config)`
			`self.dilations = list(`
			`itertools.islice(`
			`itertools.cycle(config.dilation_block), config.layers))`
			`self.context_size = sum(self.dilations) + 1`
			`self.log_scale_min = config.log_scale_min`
			`self.config = config`

			`print("dilations", self.dilations)`
			`print("context_size", self.context_size)`

			`if config.loss_type == "softmax":`
			`self.embedding_fc = ops.Embedding(`
			`self.full_name(),`
			`num_embeddings=config.num_channels,`
			`embed_dim=config.residual_channels)`
			`elif config.loss_type == "mix-gaussian-pdf":`
			`self.embedding_fc = ops.FC(`
			`self.full_name(),`
			`in_features=1,`
			`size=config.residual_channels,`
			`num_flatten_dims=2,`
			`relu=False)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`

			`self.dilated_causal_convs = []`
			`for dilation in self.dilations:`
			`self.dilated_causal_convs.append(`
			`ops.Conv1D_GU(`
			`self.full_name(),`
			`conditioner_dim=config.mel_bands,`
			`in_channels=config.residual_channels,`
			`num_filters=config.residual_channels,`
			`filter_size=config.kernel_width,`
			`dilation=dilation,`
			`causal=True`
			`)`
			`)`

			`for i, layer in enumerate(self.dilated_causal_convs):`
			`self.add_sublayer("dilated_causal_conv_{}".format(i), layer)`

			`self.fc1 = ops.FC(`
			`self.full_name(),`
			`in_features=config.residual_channels,`
			`size=config.skip_channels,`
			`num_flatten_dims=2,`
			`relu=True,`
			`act="relu")`

			`self.fc2 = ops.FC(`
			`self.full_name(),`
			`in_features=config.skip_channels,`
			`size=config.skip_channels,`
			`num_flatten_dims=2,`
			`relu=True,`
			`act="relu")`

			`if config.loss_type == "softmax":`
			`self.fc3 = ops.FC(`
			`self.full_name(),`
			`in_features=config.skip_channels,`
			`size=config.num_channels,`
			`num_flatten_dims=2,`
			`relu=False)`
			`elif config.loss_type == "mix-gaussian-pdf":`
			`self.fc3 = ops.FC(`
			`self.full_name(),`
			`in_features=config.skip_channels,`
			`size=3 * config.num_mixtures,`
			`num_flatten_dims=2,`
			`relu=False)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`

			`def sample_softmax(self, mix_parameters):`
			`batch, length, hidden = mix_parameters.shape`
			`mix_param_2d = fluid.layers.reshape(mix_parameters,`
			`[batch * length, hidden])`
			`mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)`

			`# quantized: [batch * length]`
			`quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),`
			`dtype="float32")`
			`samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0`

			`# samples: [batch * length]`
			`return samples`

			`def sample_mix_gaussian(self, mix_parameters):`
			`# mix_parameters reshape from [bs, 13799, 3 * num_mixtures]`
			`# to [bs * 13799, 3 * num_mixtures].`
			`batch, length, hidden = mix_parameters.shape`
			`mix_param_2d = fluid.layers.reshape(mix_parameters,`
			`[batch * length, hidden])`
			`K = hidden // 3`

			`# Unpack the parameters of the mixture of gaussian.`
			`logits_pi = mix_param_2d[:, 0 : K]`
			`mu = mix_param_2d[:, K : 2*K]`
			`log_s = mix_param_2d[:, 2K : 3K]`
			`s = fluid.layers.exp(log_s)`

			`pi = fluid.layers.softmax(logits_pi, axis=-1)`
			`comp_samples = fluid.layers.sampling_id(pi)`

			`row_idx = dg.to_variable(np.arange(batch * length))`
			`comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)`

			`mu_comp = fluid.layers.gather_nd(mu, comp_samples)`
			`s_comp = fluid.layers.gather_nd(s, comp_samples)`

			`# N(0, 1) Normal Sample.`
			`u = fluid.layers.gaussian_random(shape=[batch * length])`
			`samples = mu_comp + u * s_comp`
			`samples = fluid.layers.clip(samples, min=-1.0, max=1.0)`

			`return samples`

			`def softmax_loss(self, targets, mix_parameters):`
			`# targets: [bs, 13799] -> [bs, 11752]`
			`# mix_params: [bs, 13799, 3] -> [bs, 11752, 3]`
			`targets = targets[:, self.context_size:]`
			`mix_parameters = mix_parameters[:, self.context_size:, :]`

			`# Quantized audios to integral values with range [0, num_channels)`
			`num_channels = self.config.num_channels`
			`targets = fluid.layers.clip(targets, min=-1.0, max=0.99999)`
			`quantized = fluid.layers.cast(`
			`(targets + 1.0) / 2.0 * num_channels, dtype="int64")`

			`# per_sample_loss shape: [bs, 17952, 1]`
			`per_sample_loss = fluid.layers.softmax_with_cross_entropy(`
			`logits=mix_parameters, label=fluid.layers.unsqueeze(quantized, 2))`
			`loss = fluid.layers.reduce_mean(per_sample_loss)`
			`#debug(loss, "softmax loss", self.rank)`

			`return loss`

			`def mixture_density_loss(self, targets, mix_parameters, log_scale_min):`
			`# targets: [bs, 13799] -> [bs, 11752]`
			`# mix_params: [bs, 13799, 3] -> [bs, 11752, 3]`
			`targets = targets[:, self.context_size:]`
			`mix_parameters = mix_parameters[:, self.context_size:, :]`

			`# log_s: [bs, 11752, num_mixture]`
			`logits_pi, mu, log_s = fluid.layers.split(mix_parameters, num_or_sections=3, dim=-1)`

			`pi = fluid.layers.softmax(logits_pi, axis=-1)`
			`log_s = fluid.layers.clip(log_s, min=log_scale_min, max=100.0)`
			`inv_s = fluid.layers.exp(0.0 - log_s)`

			`# Calculate gaussian loss.`
			`targets = fluid.layers.unsqueeze(targets, -1)`
			`targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])`
			`x_std = inv_s * (targets - mu)`
			`exponent = fluid.layers.exp(-0.5 * x_std * x_std)`
			`# pdf_x: [bs, 11752, 1]`
			`pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent`
			`pdf_x = pi * pdf_x`
			`# pdf_x: [bs, 11752]`
			`pdf_x = fluid.layers.reduce_sum(pdf_x, dim=-1)`
			`per_sample_loss = 0.0 - fluid.layers.log(pdf_x + 1e-9)`

			`loss = fluid.layers.reduce_mean(per_sample_loss)`

			`return loss`

			`def forward(self, audios, mels, audio_starts, sample=False):`
			`# audios: [bs, 13800], mels: [bs, full_frame_length, 80]`
			`# audio_starts: [bs]`
			`# Build conditioner based on mels.`
			`full_conditioner = self.conditioner(mels)`

			`# Slice conditioners.`
			`audio_length = audios.shape[1]`
			`conditioner = extract_slices(full_conditioner,`
			`audio_starts, audio_length, self.rank)`

			`# input_audio, target_audio: [bs, 13799]`
			`input_audios = audios[:, :-1]`
			`target_audios = audios[:, 1:]`
			`# conditioner: [bs, 13799, 80]`
			`conditioner = conditioner[:, 1:, :]`

			`loss_type = self.config.loss_type`

			`# layer_input: [bs, 13799, 128]`
			`if loss_type == "softmax":`
			`input_audios = fluid.layers.clip(`
			`input_audios, min=-1.0, max=0.99999)`
			`# quantized have values in [0, num_channels)`
			`quantized = fluid.layers.cast(`
			`(input_audios + 1.0) / 2.0 * self.config.num_channels,`
			`dtype="int64")`
			`layer_input = self.embedding_fc(fluid.layers.unsqueeze(quantized, 2))`
			`elif loss_type == "mix-gaussian-pdf":`
			`layer_input = self.embedding_fc(fluid.layers.unsqueeze(input_audios, 2))`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`

			`# layer_input: [bs, res_channel, 1, 13799]`
			`layer_input = fluid.layers.unsqueeze(fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)`
			`# conditioner: [bs, mel_bands, 1, 13799]`
			`conditioner = fluid.layers.unsqueeze(fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)`

			`# layer_input: [bs, res_channel, 1, 13799]`
			`# skip: [bs, res_channel, 1, 13799]`
			`skip = None`
			`for i, layer in enumerate(self.dilated_causal_convs):`
			`layer_input, skip = layer(layer_input, skip, conditioner)`
			`#debug(layer_input, "layer_input_" + str(i), self.rank)`
			`#debug(skip, "skip_" + str(i), self.rank)`

			`# Reshape skip to [bs, 13799, res_channel]`
			`skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])`
			`#debug(skip, "skip", self.rank)`

			`# mix_param: [bs, 13799, 3 * num_mixtures]`
			`mix_parameters = self.fc3(self.fc2(self.fc1(skip)))`

			`# Sample teacher-forced audio.`
			`sample_audios = None`
			`if sample:`
			`if loss_type == "softmax":`
			`sample_audios = self.sample_softmax(mix_parameters)`
			`elif loss_type == "mix-gaussian-pdf":`
			`sample_audios = self.sample_mix_gaussian(mix_parameters)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`
			`#debug(sample_audios, "sample_audios", self.rank)`

			`# Calculate mix-gaussian density loss.`
			`# padding is all zero.`
			`# target_audio: [bs, 13799].`
			`# mix_params: [bs, 13799, 3].`
			`if loss_type == "softmax":`
			`loss = self.softmax_loss(target_audios, mix_parameters)`
			`elif loss_type == "mix-gaussian-pdf":`
			`loss = self.mixture_density_loss(target_audios,`
			`mix_parameters, self.log_scale_min)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`

			`#print("Rank {}, loss {}".format(self.rank, loss.numpy()))`

			`return loss, sample_audios`

			`def synthesize(self, mels):`
			`self.start_new_sequence()`
			`print("input mels shape", mels.shape)`
			`# mels: [bs=1, n_frames, 80]`
			`# conditioner: [1, n_frames * samples_per_frame, 80]`
			`# Should I move forward by one sample? No difference`
			`# Append context frame to mels`
			`bs, n_frames, mel_bands = mels.shape`
			`#num_pad_frames = int(np.ceil(self.context_size / self.config.fft_window_shift))`
			`#silence = fluid.layers.zeros(shape=[bs, num_pad_frames, mel_bands], dtype="float32")`
			`#inf_mels = fluid.layers.concat([silence, mels], axis=1)`
			`#print("padded mels shape", inf_mels.shape)`

			`#conditioner = self.conditioner(inf_mels)[:, self.context_size:, :]`
			`conditioner = self.conditioner(mels)`
			`time_steps = conditioner.shape[1]`
			`print("Total steps", time_steps)`

			`loss_type = self.config.loss_type`
			`audio_samples = []`
			`current_sample = fluid.layers.zeros(shape=[bs, 1, 1], dtype="float32")`
			`for i in range(time_steps):`
			`if i % 100 == 0:`
			`print("Step", i)`

			`# convert from real value sample to audio embedding.`
			`# [bs, 1, 128]`
			`if loss_type == "softmax":`
			`current_sample = fluid.layers.clip(`
			`current_sample, min=-1.0, max=0.99999)`
			`# quantized have values in [0, num_channels)`
			`quantized = fluid.layers.cast(`
			`(current_sample + 1.0) / 2.0 * self.config.num_channels,`
			`dtype="int64")`
			`audio_input = self.embedding_fc(quantized)`
			`elif loss_type == "mix-gaussian-pdf":`
			`audio_input = self.embedding_fc(current_sample)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`

			`# [bs, 128, 1, 1]`
			`audio_input = fluid.layers.unsqueeze(fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)`
			`# [bs, 80]`
			`cond_input = conditioner[:, i, :]`
			`# [bs, 80, 1, 1]`
			`cond_input = fluid.layers.reshape(`
			`cond_input, cond_input.shape + [1, 1])`

			`skip = None`
			`for layer in self.dilated_causal_convs:`
			`audio_input, skip = layer.add_input(audio_input, skip, cond_input)`

			`# [bs, 1, 128]`
			`skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])`
			`# [bs, 1, 3]`
			`mix_parameters = self.fc3(self.fc2(self.fc1(skip)))`
			`if loss_type == "softmax":`
			`sample = self.sample_softmax(mix_parameters)`
			`elif loss_type == "mix-gaussian-pdf":`
			`sample = self.sample_mix_gaussian(mix_parameters)`
			`else:`
			`raise ValueError(`
			`"loss_type {} is unsupported!".format(loss_type))`
			`audio_samples.append(sample)`
			`# [bs]`
			`current_sample = audio_samples[-1]`
			`# [bs, 1, 1]`
			`current_sample = fluid.layers.reshape(current_sample,`
			`current_sample.shape + [1, 1])`

			`# syn_audio: (num_samples,)`
			`syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()`

			`return syn_audio`

			`def start_new_sequence(self):`
			`for layer in self.sublayers():`
			`if isinstance(layer, weight_norm.Conv1D):`
			`layer.start_new_sequence()`

			`def save(self, iteration):`
			`utils.save_latest_parameters(self.checkpoint_dir, iteration,`
			`self.wavenet, self.optimizer)`
			`utils.save_latest_checkpoint(self.checkpoint_dir, iteration)`