Parakeet/examples/wavenet/data.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
import csv
import numpy as np
import librosa
from pathlib import Path
import pandas as pd

from parakeet.data import batch_spec, batch_wav
from parakeet.data import DatasetMixin


class LJSpeechMetaData(DatasetMixin):
    def __init__(self, root):
        self.root = Path(root)
        self._wav_dir = self.root.joinpath("wavs")
        csv_path = self.root.joinpath("metadata.csv")
        self._table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

    def get_example(self, i):
        fname, raw_text, normalized_text = self._table.iloc[i]
        fname = str(self._wav_dir.joinpath(fname + ".wav"))
        return fname, raw_text, normalized_text

    def __len__(self):
        return len(self._table)


class Transform(object):
    def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __call__(self, example):
        wav_path, _, _ = example

        sr = self.sample_rate
        n_fft = self.n_fft
        win_length = self.win_length
        hop_length = self.hop_length
        n_mels = self.n_mels

        wav, loaded_sr = librosa.load(wav_path, sr=None)
        assert loaded_sr == sr, "sample rate does not match, resampling applied"

        # Pad audio to the right size.
        frames = int(np.ceil(float(wav.size) / hop_length))
        fft_padding = (n_fft - hop_length) // 2  # sound
        desired_length = frames * hop_length + fft_padding * 2
        pad_amount = (desired_length - wav.size) // 2

        if wav.size % 2 == 0:
            wav = np.pad(wav, (pad_amount, pad_amount), mode='reflect')
        else:
            wav = np.pad(wav, (pad_amount, pad_amount + 1), mode='reflect')

        # Normalize audio.
        wav = wav / np.abs(wav).max() * 0.999

        # Compute mel-spectrogram.
        # Turn center to False to prevent internal padding.
        spectrogram = librosa.core.stft(
            wav,
            hop_length=hop_length,
            win_length=win_length,
            n_fft=n_fft,
            center=False)
        spectrogram_magnitude = np.abs(spectrogram)

        # Compute mel-spectrograms.
        mel_filter_bank = librosa.filters.mel(sr=sr,
                                              n_fft=n_fft,
                                              n_mels=n_mels)
        mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
        mel_spectrogram = mel_spectrogram

        # Rescale mel_spectrogram.
        min_level, ref_level = 1e-5, 20  # hard code it
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        # Extract the center of audio that corresponds to mel spectrograms.
        audio = wav[fft_padding:-fft_padding]
        assert mel_spectrogram.shape[1] * hop_length == audio.size

        # there is no clipping here
        return audio, mel_spectrogram


class DataCollector(object):
    def __init__(self,
                 context_size,
                 sample_rate,
                 hop_length,
                 train_clip_seconds,
                 valid=False):
        frames_per_second = sample_rate // hop_length
        train_clip_frames = int(
            np.ceil(train_clip_seconds * frames_per_second))
        context_frames = context_size // hop_length
        self.num_frames = train_clip_frames + context_frames

        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.valid = valid

    def random_crop(self, sample):
        audio, mel_spectrogram = sample
        audio_frames = int(audio.size) // self.hop_length
        max_start_frame = audio_frames - self.num_frames
        assert max_start_frame >= 0, "audio is too short to be cropped"

        frame_start = np.random.randint(0, max_start_frame)
        # frame_start = 0  # norandom
        frame_end = frame_start + self.num_frames

        audio_start = frame_start * self.hop_length
        audio_end = frame_end * self.hop_length

        audio = audio[audio_start:audio_end]
        return audio, mel_spectrogram, audio_start

    def __call__(self, samples):
        # transform them first
        if self.valid:
            samples = [(audio, mel_spectrogram, 0)
                       for audio, mel_spectrogram in samples]
        else:
            samples = [self.random_crop(sample) for sample in samples]
        # batch them
        audios = [sample[0] for sample in samples]
        audio_starts = [sample[2] for sample in samples]
        mels = [sample[1] for sample in samples]

        mels = batch_spec(mels)

        if self.valid:
            audios = batch_wav(audios, dtype=np.float32)
        else:
            audios = np.array(audios, dtype=np.float32)
        audio_starts = np.array(audio_starts, dtype=np.int64)
        return audios, mels, audio_starts