# Parakeet/examples/waveflow/data.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random

import librosa
import numpy as np
from paddle import fluid

from parakeet.datasets import ljspeech
from parakeet.data import SpecBatcher, WavBatcher
from parakeet.data import DataCargo, DatasetMixin
from parakeet.data import DistributedSampler, BatchSampler
from scipy.io.wavfile import read


class Dataset(ljspeech.LJSpeech):
    """LJSpeech dataset whose examples are raw waveforms.

    Args:
        config: configuration object. This class reads ``config.root``
            (passed to the parent constructor) and ``config.sample_rate``
            (the sample rate every wav file is required to have).
    """

    def __init__(self, config):
        super(Dataset, self).__init__(config.root)
        self.config = config

    def _get_example(self, metadatum):
        """Load the waveform referenced by one metadata record.

        Args:
            metadatum: a 3-item record; only the first item, the wav file
                name without extension, is used here.

        Returns:
            The audio array exactly as returned by
            ``scipy.io.wavfile.read`` (no scaling or dtype conversion).

        Raises:
            ValueError: if the file's sample rate differs from
                ``config.sample_rate``.
        """
        fname, _, _ = metadatum
        # NOTE(review): ``self.root`` is presumably set by the parent
        # constructor from ``config.root`` -- confirm in ljspeech.LJSpeech.
        wav_path = os.path.join(self.root, "wavs", fname + ".wav")

        loaded_sr, audio = read(wav_path)
        # Explicit check instead of ``assert`` so that it still runs under
        # ``python -O`` and reports which file is at fault.
        if loaded_sr != self.config.sample_rate:
            raise ValueError(
                "Sample rate mismatch for {}: file has {} Hz but config "
                "expects {} Hz.".format(wav_path, loaded_sr,
                                        self.config.sample_rate))

        return audio


class Subset(DatasetMixin):
    """A view over selected examples of a dataset yielding (audio, mel).

    Args:
        dataset: the underlying dataset; must expose a ``config`` attribute
            and support integer indexing that returns a 1-D audio array.
        indices: indices into ``dataset`` that make up this subset.
        valid: if true, examples are returned at full length; otherwise
            each example is randomly cropped or zero-padded to
            ``config.segment_length`` samples.
    """

    def __init__(self, dataset, indices, valid):
        self.dataset = dataset
        self.indices = indices
        self.valid = valid
        self.config = dataset.config
        # Lazily filled cache: (parameter tuple, mel filter bank).
        self._mel_cache = None

    def _get_mel_filter_bank(self):
        """Return the mel filter bank for the current config, cached.

        The filter bank depends only on config values, so it is built once
        and reused for every example. The cache is keyed on those values,
        so changing the config afterwards triggers a rebuild.
        """
        config = self.config
        key = (config.sample_rate, config.fft_size, config.mel_bands,
               config.mel_fmin, config.mel_fmax)

        # getattr keeps this working for instances created without running
        # __init__ (e.g. via __new__ or a subclass that skips it).
        cached = getattr(self, "_mel_cache", None)
        if cached is not None and cached[0] == key:
            return cached[1]

        # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
        mel_filter_bank = librosa.filters.mel(sr=config.sample_rate,
                                              n_fft=config.fft_size,
                                              n_mels=config.mel_bands,
                                              fmin=config.mel_fmin,
                                              fmax=config.mel_fmax)
        self._mel_cache = (key, mel_filter_bank)
        return mel_filter_bank

    def get_mel(self, audio):
        """Compute the log-mel spectrogram of ``audio``.

        Args:
            audio: 1-D floating point waveform.

        Returns:
            Array of shape [n_mels, num_frames] holding the natural log of
            the mel-scaled STFT magnitude, with values clipped from below
            at 1e-5 before the log.
        """
        spectrogram = librosa.core.stft(
            audio,
            n_fft=self.config.fft_size,
            hop_length=self.config.fft_window_shift,
            win_length=self.config.fft_window_size)
        spectrogram_magnitude = np.abs(spectrogram)

        mel_filter_bank = self._get_mel_filter_bank()
        # mel shape: [n_mels, num_frames]
        mel = np.dot(mel_filter_bank, spectrogram_magnitude)

        # Normalize mel. Clipping avoids log(0) on silent frames.
        clip_val = 1e-5
        ref_constant = 1
        mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant)

        return mel

    def __getitem__(self, idx):
        """Return the ``(audio, mel)`` pair for the ``idx``-th subset item."""
        audio = self.dataset[self.indices[idx]]
        segment_length = self.config.segment_length

        if self.valid:
            # whole audio for valid set
            pass
        else:
            # Randomly crop segment_length from audios in the training set.
            # audio shape: [len]
            if audio.shape[0] >= segment_length:
                max_audio_start = audio.shape[0] - segment_length
                # random.randint is inclusive on both ends, so the last
                # possible window (ending at the final sample) is reachable.
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:(audio_start + segment_length)]
            else:
                # Too short: right-pad with zeros up to segment_length.
                audio = np.pad(audio, (0, segment_length - audio.shape[0]),
                               mode='constant',
                               constant_values=0)

        # Normalize audio to the [-1, 1] range.
        # NOTE(review): the 32768 divisor assumes 16-bit PCM input --
        # confirm against the wav files in use.
        audio = audio.astype(np.float32) / 32768.0
        mel = self.get_mel(audio)

        return audio, mel

    def _batch_examples(self, batch):
        """Collate a list of ``(audio, mel)`` samples into two batches.

        Audios go through ``WavBatcher`` and mels through ``SpecBatcher``,
        both with a pad value of 0.0.
        """
        audios = [sample[0] for sample in batch]
        mels = [sample[1] for sample in batch]

        audios = WavBatcher(pad_value=0.0)(audios)
        mels = SpecBatcher(pad_value=0.0)(mels)

        return audios, mels

    def __len__(self):
        """Return the number of examples in this subset."""
        return len(self.indices)


class LJSpeech:
    """Builds the train and valid data readers for LJSpeech.

    Args:
        config: configuration object. This class reads ``use_gpu``,
            ``valid_size`` and ``batch_size``; the remaining fields are
            consumed by ``Dataset`` and ``Subset``.
        nranks: number of participating processes.
        rank: index of the current process; also used as the CUDA device
            id when ``config.use_gpu`` is set.

    Attributes:
        trainloader: endless generator that restarts the train reader each
            time it is exhausted.
        validloader: the validation ``PyReader`` (batch size 1, no
            shuffling).

    Raises:
        ValueError: if ``config.batch_size`` is not divisible by ``nranks``.
    """

    def __init__(self, config, nranks, rank):
        place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()

        # Whole LJSpeech dataset.
        ds = Dataset(config)

        # Split into train and valid dataset: the first ``valid_size``
        # examples are held out, the rest are shuffled for training.
        indices = list(range(len(ds)))
        train_indices = indices[config.valid_size:]
        valid_indices = indices[:config.valid_size]
        random.shuffle(train_indices)

        # Train dataset.
        trainset = Subset(ds, train_indices, valid=False)
        sampler = DistributedSampler(len(trainset), nranks, rank)
        total_bs = config.batch_size
        # Explicit check instead of ``assert`` so that it still runs under
        # ``python -O`` and explains the misconfiguration.
        if total_bs % nranks != 0:
            raise ValueError(
                "batch_size ({}) must be divisible by nranks ({}).".format(
                    total_bs, nranks))
        # Each process gets an equal share of the global batch.
        train_sampler = BatchSampler(
            sampler, total_bs // nranks, drop_last=True)
        trainloader = DataCargo(trainset, batch_sampler=train_sampler)

        trainreader = fluid.io.PyReader(capacity=50, return_list=True)
        trainreader.decorate_batch_generator(trainloader, place)
        # ``iter(int, 1)`` never terminates (int() is always 0, never the
        # sentinel 1), so this generator cycles over the reader forever.
        self.trainloader = (data for _ in iter(int, 1)
                            for data in trainreader())

        # Valid dataset.
        validset = Subset(ds, valid_indices, valid=True)
        # Currently only support batch_size = 1 for valid loader.
        validloader = DataCargo(validset, batch_size=1, shuffle=False)

        validreader = fluid.io.PyReader(capacity=20, return_list=True)
        validreader.decorate_batch_generator(validloader, place)
        self.validloader = validreader