ParakeetRebeccaRosario/examples/deepvoice3/data.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
import os
import csv
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from scipy import signal, io
import six

from parakeet.data import DatasetMixin, TransformDataset, FilterDataset
from parakeet.g2p.en import text_to_sequence, sequence_to_text


class LJSpeechMetaData(DatasetMixin):
    """Index over an LJSpeech-style dataset directory.

    The directory ``root`` is expected to hold a ``metadata.csv`` file and a
    ``wavs`` sub-directory. The CSV is read as a headerless, ``|``-separated
    table with three columns: ``fname``, ``raw_text`` and
    ``normalized_text``. Quoting is disabled (``csv.QUOTE_NONE``).
    """

    def __init__(self, root):
        """Load ``<root>/metadata.csv`` into memory.

        Args:
            root: path (str or path-like) of the dataset directory.
        """
        self.root = Path(root)
        self._wav_dir = self.root / "wavs"
        self._table = pd.read_csv(
            self.root / "metadata.csv",
            sep="|",
            encoding="utf-8",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

    def get_example(self, i):
        """Return ``(wav_path, raw_text, normalized_text)`` for row ``i``.

        ``wav_path`` is the string path ``<root>/wavs/<fname>.wav``; the two
        text fields are returned exactly as stored in the table.
        """
        stem, raw_text, normalized_text = self._table.iloc[i]
        wav_path = self._wav_dir / (stem + ".wav")
        return str(wav_path), raw_text, normalized_text

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self._table)


class Transform(object):
    """Turn one metadata record into model-ready text and spectrogram features.

    Calling an instance on ``(fname, raw_text, normalized_text)`` converts the
    normalized text to a symbol sequence with ``text_to_sequence`` and
    computes a normalized linear spectrogram and a normalized mel spectrogram
    from the wav file at ``fname``.
    """

    def __init__(self,
                 replace_pronounciation_prob=0.,
                 sample_rate=22050,
                 preemphasis=.97,
                 n_fft=1024,
                 win_length=1024,
                 hop_length=256,
                 fmin=125,
                 fmax=7600,
                 n_mels=80,
                 min_level_db=-100,
                 ref_level_db=20,
                 max_norm=0.999,
                 clip_norm=True):
        """Store the text and audio processing settings.

        Args:
            replace_pronounciation_prob: second argument forwarded to
                ``text_to_sequence``.
            sample_rate: rate passed to ``librosa.load``.
            preemphasis: coefficient of the pre-emphasis filter.
            n_fft, win_length, hop_length: STFT parameters.
            fmin, fmax, n_mels: mel filter bank parameters.
            min_level_db: dB floor used for clamping and normalization.
            ref_level_db: reference level subtracted after dB conversion.
            max_norm: scale applied to the normalized values (also the upper
                clip bound).
            clip_norm: when true, clip normalized values to
                ``[0, max_norm]``.
        """
        # text front-end
        self.replace_pronounciation_prob = replace_pronounciation_prob

        # waveform / STFT
        self.sample_rate = sample_rate
        self.preemphasis = preemphasis
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length

        # mel filter bank
        self.fmin = fmin
        self.fmax = fmax
        self.n_mels = n_mels

        # dB normalization
        self.min_level_db = min_level_db
        self.ref_level_db = ref_level_db
        self.max_norm = max_norm
        self.clip_norm = clip_norm

    def _to_normalized_db(self, magnitude, amplitude_floor):
        """Convert a linear-amplitude array to dB and rescale it.

        The magnitudes are clamped from below at ``amplitude_floor``,
        converted to dB, shifted by ``ref_level_db``, mapped linearly so that
        ``min_level_db`` lands on 0, scaled by ``max_norm`` and, if
        ``clip_norm`` is set, clipped to ``[0, max_norm]``.
        """
        level_db = 20 * np.log10(np.maximum(amplitude_floor,
                                            magnitude)) - self.ref_level_db
        scaled = (level_db - self.min_level_db) / (-self.min_level_db)
        scaled = self.max_norm * scaled
        if self.clip_norm:
            scaled = np.clip(scaled, 0, self.max_norm)
        return scaled

    def __call__(self, in_data):
        """Process one ``(fname, raw_text, normalized_text)`` record.

        Returns:
            tuple ``(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
            S_mel_norm, n_frames)`` where ``speaker_id`` is always ``None``
            and ``n_frames`` is the size of the last axis of ``S_mel_norm``
            (the unpadded frame count).
        """
        fname, _, normalized_text = in_data

        # --- text ---
        mix_grapheme_phonemes = text_to_sequence(
            normalized_text, self.replace_pronounciation_prob)
        text_length = len(mix_grapheme_phonemes)
        # no speaker information is attached to the example
        speaker_id = None

        # --- audio ---
        wav, _ = librosa.load(fname, sr=self.sample_rate)
        # pre-emphasis FIR filter: y[n] = x[n] - preemphasis * x[n - 1]
        emphasized = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # magnitude of the short-time Fourier transform
        stft = librosa.stft(
            y=emphasized,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
        S = np.abs(stft)

        # amplitude corresponding to min_level_db (1e-5 for -100 dB)
        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))

        # linear spectrogram: dB-scaled and normalized
        S_norm = self._to_normalized_db(S, amplitude_min)

        # mel spectrogram: the filter bank is applied to the LINEAR
        # magnitudes (power=1.), not to the dB-scaled values
        S_mel = librosa.feature.melspectrogram(
            S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
        S_mel_norm = self._to_normalized_db(S_mel, amplitude_min)

        # original (unpadded) number of frames
        n_frames = S_mel_norm.shape[-1]
        return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
                S_mel_norm, n_frames)


class DataCollector(object):
    """Collate a list of transformed examples into padded batch arrays.

    Each example is the 6-tuple ``(mix_grapheme_phonemes, text_length,
    speaker_id, S_norm, S_mel_norm, num_frames)``. Spectrograms are 2-D with
    time on the last axis; they are padded along time and then transposed so
    that time becomes axis 1 of the batch.
    """

    def __init__(self, downsample_factor=4, r=1):
        """Store the frame-grouping settings.

        Args:
            downsample_factor: frame downsampling factor.
            r: number of frames per decoder step.
        """
        self.downsample_factor = int(downsample_factor)
        self.frames_per_step = int(r)
        frames_per_decoder_step = int(downsample_factor * r)
        # padded frame counts are rounded up to a multiple of this value
        self._factor = frames_per_decoder_step
        # CAUTION: small diff here -- one full decoder step of zero frames is
        # prepended to every spectrogram
        self._pad_begin = frames_per_decoder_step

    def __call__(self, examples):
        """Pad and stack ``examples`` into batch arrays.

        Returns:
            tuple ``(text_sequences, text_lengths, text_positions, mel_specs,
            lin_specs, frames, decoder_positions, done_flags)``:

            * ``text_sequences``: int64, ``(batch, max_text_length)``,
              zero-padded on the right.
            * ``text_lengths``: int64, ``(batch,)``.
            * ``text_positions``: int64, 1-based positions, 0 at padding.
            * ``mel_specs`` / ``lin_specs``: float32,
              ``(batch, max_frames, feature)``.
            * ``frames``: int64, ``(batch,)``, unpadded frame counts.
            * ``decoder_positions``: int64, ``(batch, max_decoder_length)``,
              1-based.
            * ``done_flags``: float32, ``(batch, max_decoder_length)``,
              zeros followed by ones.
        """
        batch_size = len(examples)
        factor = self._factor
        pad_begin = self._pad_begin

        # per-example lengths
        text_lengths = np.array([ex[1] for ex in examples]).astype(np.int64)
        frames = np.array([ex[5] for ex in examples]).astype(np.int64)

        max_text_length = int(np.max(text_lengths))

        # round the longest frame count up to a multiple of `factor`, then
        # make room for the leading padding
        max_frames = int(np.max(frames))
        remainder = max_frames % factor
        if remainder != 0:
            max_frames += factor - remainder
        max_frames += pad_begin
        max_decoder_length = max_frames // factor

        # pad every sequence to the batch maximum
        text_sequences, lin_specs, mel_specs, done_flags = [], [], [], []
        for (symbols, text_length, _speaker_id, lin_spec, mel_spec,
             num_frames) in examples:
            text_sequences.append(
                np.pad(symbols, (0, max_text_length - text_length),
                       mode="constant"))

            # no padding on the feature axis; `pad_begin` zero frames in
            # front and the remaining zero frames at the end of the time axis
            time_padding = ((0, 0),
                            (pad_begin, max_frames - pad_begin - num_frames))
            lin_specs.append(np.pad(lin_spec, time_padding, mode="constant"))
            mel_specs.append(np.pad(mel_spec, time_padding, mode="constant"))

            # NOTE(review): `//` already floors, so np.ceil is a no-op here;
            # kept as is to leave the done-flag targets unchanged.
            n_not_done = int(np.ceil(num_frames // factor))
            done_flags.append(
                np.pad(np.zeros((n_not_done, )),
                       (0, max_decoder_length - n_not_done),
                       mode="constant",
                       constant_values=1))

        text_sequences = np.array(text_sequences).astype(np.int64)
        # (batch, feature, time) -> (batch, time, feature)
        lin_specs = np.array(lin_specs).transpose(0, 2, 1).astype(np.float32)
        mel_specs = np.array(mel_specs).transpose(0, 2, 1).astype(np.float32)
        done_flags = np.array(done_flags).astype(np.float32)

        # text positions: 1-based, zeroed where the sequence is padding
        position_ids = np.arange(1, 1 + max_text_length, dtype=np.int64)
        text_mask = (position_ids <= text_lengths[:, None]).astype(np.int64)
        text_positions = position_ids * text_mask

        # decoder positions: the same 1-based ramp for every example
        decoder_ramp = np.arange(1, 1 + max_decoder_length, dtype=np.int64)
        decoder_positions = np.tile(decoder_ramp[None, :], (batch_size, 1))

        return (text_sequences, text_lengths, text_positions, mel_specs,
                lin_specs, frames, decoder_positions, done_flags)
add license 2020-02-26 21:03:51 +08:00			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`from __future__ import division`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`import os`
			`import csv`
			`from pathlib import Path`
			`import numpy as np`
			`import pandas as pd`
			`import librosa`
			`from scipy import signal, io`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`import six`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
			`from parakeet.data import DatasetMixin, TransformDataset, FilterDataset`
			`from parakeet.g2p.en import text_to_sequence, sequence_to_text`


			`class LJSpeechMetaData(DatasetMixin):`
			`def __init__(self, root):`
			`self.root = Path(root)`
			`self._wav_dir = self.root.joinpath("wavs")`
			`csv_path = self.root.joinpath("metadata.csv")`
			`self._table = pd.read_csv(`
			`csv_path,`
			`sep="\|",`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`encoding="utf-8",`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`header=None,`
			`quoting=csv.QUOTE_NONE,`
			`names=["fname", "raw_text", "normalized_text"])`

			`def get_example(self, i):`
			`fname, raw_text, normalized_text = self._table.iloc[i]`
			`fname = str(self._wav_dir.joinpath(fname + ".wav"))`
			`return fname, raw_text, normalized_text`

			`def __len__(self):`
			`return len(self._table)`


			`class Transform(object):`
			`def __init__(self,`
			`replace_pronounciation_prob=0.,`
			`sample_rate=22050,`
			`preemphasis=.97,`
			`n_fft=1024,`
			`win_length=1024,`
			`hop_length=256,`
			`fmin=125,`
			`fmax=7600,`
			`n_mels=80,`
			`min_level_db=-100,`
			`ref_level_db=20,`
			`max_norm=0.999,`
			`clip_norm=True):`
			`self.replace_pronounciation_prob = replace_pronounciation_prob`

			`self.sample_rate = sample_rate`
			`self.preemphasis = preemphasis`
			`self.n_fft = n_fft`
			`self.win_length = win_length`
			`self.hop_length = hop_length`

			`self.fmin = fmin`
			`self.fmax = fmax`
			`self.n_mels = n_mels`

			`self.min_level_db = min_level_db`
			`self.ref_level_db = ref_level_db`
			`self.max_norm = max_norm`
			`self.clip_norm = clip_norm`

			`def __call__(self, in_data):`
			`fname, _, normalized_text = in_data`

			`# text processing`
			`mix_grapheme_phonemes = text_to_sequence(`
			`normalized_text, self.replace_pronounciation_prob)`
			`text_length = len(mix_grapheme_phonemes)`
			`# CAUTION: positions start from 1`
			`speaker_id = None`

			`# wave processing`
			`wav, _ = librosa.load(fname, sr=self.sample_rate)`
			`# preemphasis`
			`y = signal.lfilter([1., -self.preemphasis], [1.], wav)`

			`# STFT`
add license 2020-02-26 21:03:51 +08:00			`D = librosa.stft(`
			`y=y,`
			`n_fft=self.n_fft,`
			`win_length=self.win_length,`
			`hop_length=self.hop_length)`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`S = np.abs(D)`

			`# to db and normalize to 0-1`
			`amplitude_min = np.exp(self.min_level_db / 20 * np.log(10)) # 1e-5`
			`S_norm = 20 * np.log10(np.maximum(amplitude_min,`
			`S)) - self.ref_level_db`
			`S_norm = (S_norm - self.min_level_db) / (-self.min_level_db)`
			`S_norm = self.max_norm * S_norm`
			`if self.clip_norm:`
			`S_norm = np.clip(S_norm, 0, self.max_norm)`

			`# mel scale and to db and normalize to 0-1,`
			`# CAUTION: pass linear scale S, not dbscaled S`
add license 2020-02-26 21:03:51 +08:00			`S_mel = librosa.feature.melspectrogram(`
			`S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`S_mel = 20 * np.log10(np.maximum(amplitude_min,`
			`S_mel)) - self.ref_level_db`
			`S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)`
			`S_mel_norm = self.max_norm * S_mel_norm`
			`if self.clip_norm:`
			`S_mel_norm = np.clip(S_mel_norm, 0, self.max_norm)`

			`# num_frames`
			`n_frames = S_mel_norm.shape[-1] # CAUTION: original number of frames`
			`return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,`
			`S_mel_norm, n_frames)`


			`class DataCollector(object):`
			`def __init__(self, downsample_factor=4, r=1):`
			`self.downsample_factor = int(downsample_factor)`
			`self.frames_per_step = int(r)`
			`self._factor = int(downsample_factor * r)`
			`# CAUTION: small diff here`
			`self._pad_begin = int(downsample_factor * r)`

			`def __call__(self, examples):`
			`batch_size = len(examples)`

			`# lengths`
			`text_lengths = np.array([example[1]`
			`for example in examples]).astype(np.int64)`
			`frames = np.array([example[5]`
			`for example in examples]).astype(np.int64)`

			`max_text_length = int(np.max(text_lengths))`
			`max_frames = int(np.max(frames))`
			`if max_frames % self._factor != 0:`
			`max_frames += (self._factor - max_frames % self._factor)`
			`max_frames += self._pad_begin`
			`max_decoder_length = max_frames // self._factor`

			`# pad time sequence`
			`text_sequences = []`
			`lin_specs = []`
			`mel_specs = []`
			`done_flags = []`
			`for example in examples:`
			`(mix_grapheme_phonemes, text_length, speaker_id, S_norm,`
			`S_mel_norm, num_frames) = example`
			`text_sequences.append(`
add license 2020-02-26 21:03:51 +08:00			`np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`),`
			`mode="constant"))`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`lin_specs.append(`
add license 2020-02-26 21:03:51 +08:00			`np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`self._pad_begin - num_frames)),`
			`mode="constant"))`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`mel_specs.append(`
add license 2020-02-26 21:03:51 +08:00			`np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`self._pad_begin - num_frames)),`
			`mode="constant"))`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`done_flags.append(`
			`np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),`
add license 2020-02-26 21:03:51 +08:00			`(0, max_decoder_length - int(`
			`np.ceil(num_frames // self._factor))),`
fix for compatability of python2 and lower versions of numpy 2020-03-10 16:17:56 +08:00			`mode="constant",`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`constant_values=1))`
			`text_sequences = np.array(text_sequences).astype(np.int64)`
			`lin_specs = np.transpose(np.array(lin_specs),`
			`(0, 2, 1)).astype(np.float32)`
			`mel_specs = np.transpose(np.array(mel_specs),`
			`(0, 2, 1)).astype(np.float32)`
			`done_flags = np.array(done_flags).astype(np.float32)`

			`# text positions`
			`text_mask = (np.arange(1, 1 + max_text_length) <= np.expand_dims(`
			`text_lengths, -1)).astype(np.int64)`
fix integer data type for deepvoice3's data loader 2020-03-19 11:26:46 +08:00			`text_positions = np.arange(`
			`1, 1 + max_text_length, dtype=np.int64) * text_mask`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00
			`# decoder_positions`
			`decoder_positions = np.tile(`
fix integer data type for deepvoice3's data loader 2020-03-19 11:26:46 +08:00			`np.expand_dims(`
			`np.arange(`
			`1, 1 + max_decoder_length, dtype=np.int64), 0),`
add deepvoice3 model and example 2020-02-13 10:24:34 +08:00			`(batch_size, 1))`

			`return (text_sequences, text_lengths, text_positions, mel_specs,`
			`lin_specs, frames, decoder_positions, done_flags)`