204 lines
7.4 KiB
Python
204 lines
7.4 KiB
Python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import division
|
|
import os
|
|
import csv
|
|
from pathlib import Path
|
|
import numpy as np
|
|
import pandas as pd
|
|
import librosa
|
|
from scipy import signal, io
|
|
import six
|
|
|
|
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset
|
|
from parakeet.g2p.en import text_to_sequence, sequence_to_text
|
|
|
|
|
|
class LJSpeechMetaData(DatasetMixin):
    """Index over the ``metadata.csv`` table of an LJSpeech-style dataset.

    Every example is a ``(wav_path, raw_text, normalized_text)`` triple.
    The two text fields come straight from the table; the wave path is
    built from the first column and points into ``<root>/wavs``.
    """

    def __init__(self, root):
        """Read ``<root>/metadata.csv`` into an in-memory table.

        Args:
            root: dataset directory holding ``metadata.csv`` and ``wavs``.
        """
        self.root = Path(root)
        self._wav_dir = self.root / "wavs"
        # The table is "|"-separated with no header row. Quoting is turned
        # off so quote characters inside transcripts are kept verbatim.
        self._table = pd.read_csv(
            self.root / "metadata.csv",
            sep="|",
            encoding="utf-8",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

    def get_example(self, i):
        """Return ``(wav_path, raw_text, normalized_text)`` for row ``i``."""
        stem, raw_text, normalized_text = self._table.iloc[i]
        wav_path = self._wav_dir / (stem + ".wav")
        return str(wav_path), raw_text, normalized_text

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self._table)
|
|
|
|
|
|
class Transform(object):
    """Convert one metadata record into text ids and spectrogram features.

    Calling the instance on ``(wav_path, raw_text, normalized_text)``
    returns ``(text_ids, text_length, speaker_id, linear_spec, mel_spec,
    n_frames)``. Both spectrograms are converted to dB and rescaled with
    the same normalization (see ``_normalize_db``).
    """

    def __init__(self,
                 replace_pronounciation_prob=0.,
                 sample_rate=22050,
                 preemphasis=.97,
                 n_fft=1024,
                 win_length=1024,
                 hop_length=256,
                 fmin=125,
                 fmax=7600,
                 n_mels=80,
                 min_level_db=-100,
                 ref_level_db=20,
                 max_norm=0.999,
                 clip_norm=True):
        """Store the text and audio processing settings.

        Args:
            replace_pronounciation_prob: forwarded as the second argument
                of ``text_to_sequence``.
            sample_rate: rate the audio is loaded at; also used to build
                the mel filterbank.
            preemphasis: coefficient ``a`` of the FIR filter
                ``y[n] = x[n] - a * x[n - 1]`` applied before the STFT.
            n_fft: forwarded to ``librosa.stft``.
            win_length: forwarded to ``librosa.stft``.
            hop_length: forwarded to ``librosa.stft``.
            fmin: forwarded to ``librosa.feature.melspectrogram``.
            fmax: forwarded to ``librosa.feature.melspectrogram``.
            n_mels: forwarded to ``librosa.feature.melspectrogram``.
            min_level_db: dB floor; it determines the smallest amplitude
                kept before the log and the normalization range.
            ref_level_db: value subtracted from the dB spectrogram.
            max_norm: scale applied after normalization (and the upper
                clipping bound when ``clip_norm`` is true).
            clip_norm: whether to clip normalized values to
                ``[0, max_norm]``.
        """
        self.replace_pronounciation_prob = replace_pronounciation_prob

        self.sample_rate = sample_rate
        self.preemphasis = preemphasis
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length

        self.fmin = fmin
        self.fmax = fmax
        self.n_mels = n_mels

        self.min_level_db = min_level_db
        self.ref_level_db = ref_level_db
        self.max_norm = max_norm
        self.clip_norm = clip_norm

    def _normalize_db(self, magnitude):
        """Map a linear-amplitude spectrogram to the normalized dB scale.

        Amplitudes are floored at the value corresponding to
        ``min_level_db`` (1e-5 for -100 dB), converted to dB, shifted by
        ``ref_level_db``, rescaled by ``min_level_db`` and ``max_norm``,
        and optionally clipped to ``[0, max_norm]``.
        """
        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
        level_db = 20 * np.log10(np.maximum(amplitude_min,
                                            magnitude)) - self.ref_level_db
        normalized = (level_db - self.min_level_db) / (-self.min_level_db)
        normalized = self.max_norm * normalized
        if self.clip_norm:
            normalized = np.clip(normalized, 0, self.max_norm)
        return normalized

    def __call__(self, in_data):
        """Process one ``(wav_path, raw_text, normalized_text)`` record.

        Returns:
            tuple: ``(mix_grapheme_phonemes, text_length, speaker_id,
            S_norm, S_mel_norm, n_frames)`` where ``speaker_id`` is always
            ``None`` and ``n_frames`` is the size of the last axis of the
            mel spectrogram (before any batching/padding).
        """
        fname, _, normalized_text = in_data

        # text processing
        mix_grapheme_phonemes = text_to_sequence(
            normalized_text, self.replace_pronounciation_prob)
        text_length = len(mix_grapheme_phonemes)
        # No speaker information is produced by this transform.
        speaker_id = None

        # wave processing
        wav, _ = librosa.load(fname, sr=self.sample_rate)
        # preemphasis: y[n] = x[n] - preemphasis * x[n - 1]
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # STFT magnitude (linear amplitude)
        D = librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
        S = np.abs(D)

        # linear spectrogram: to dB and normalized
        S_norm = self._normalize_db(S)

        # mel spectrogram: to dB and normalized.
        # CAUTION: pass linear scale S, not dbscaled S.
        # sr must be given explicitly; otherwise librosa builds the mel
        # filterbank for its default 22050 Hz regardless of sample_rate.
        S_mel = librosa.feature.melspectrogram(
            S=S,
            sr=self.sample_rate,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            power=1.)
        S_mel_norm = self._normalize_db(S_mel)

        # CAUTION: original (unpadded) number of frames
        n_frames = S_mel_norm.shape[-1]
        return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
                S_mel_norm, n_frames)
|
|
|
|
|
|
class DataCollector(object):
    """Collate transformed examples into padded batch arrays.

    Each example is a 6-tuple ``(text_ids, text_length, speaker_id,
    linear_spec, mel_spec, n_frames)``; the spectrograms are padded along
    their last axis. The time axis is padded so that its length is a
    multiple of ``downsample_factor * r``, plus one extra leading chunk of
    that size.
    """

    def __init__(self, downsample_factor=4, r=1):
        """Store the padding granularity.

        Args:
            downsample_factor: stored as ``downsample_factor``.
            r: stored as ``frames_per_step``.
        """
        self.downsample_factor = int(downsample_factor)
        self.frames_per_step = int(r)
        chunk = int(downsample_factor * r)
        self._factor = chunk
        # CAUTION: small diff here
        self._pad_begin = chunk

    def __call__(self, examples):
        """Pad and stack ``examples`` into a batch.

        Returns:
            tuple: ``(text_sequences, text_lengths, text_positions,
            mel_specs, lin_specs, frames, decoder_positions, done_flags)``.
            Spectrograms are returned with the time axis before the
            feature axis. Positions are 1-based, with 0 marking padded
            text positions. ``done_flags`` is 0 for the first
            ``n_frames // factor`` decoder steps of each example and 1
            afterwards.
        """
        batch_size = len(examples)
        step = self._factor
        lead = self._pad_begin

        # lengths
        text_lengths = np.array([ex[1] for ex in examples]).astype(np.int64)
        frames = np.array([ex[5] for ex in examples]).astype(np.int64)
        max_text_length = int(text_lengths.max())

        # Round the longest utterance up to a whole number of chunks, then
        # add the leading pad.
        max_frames = int(frames.max())
        max_frames += -max_frames % step
        max_frames += lead
        max_decoder_length = max_frames // step

        # pad time sequence
        text_sequences, lin_specs, mel_specs, done_flags = [], [], [], []
        for tokens, n_tokens, _, lin, mel, n_frames in examples:
            text_sequences.append(
                np.pad(tokens, (0, max_text_length - n_tokens),
                       mode="constant"))

            time_pad = ((0, 0), (lead, max_frames - lead - n_frames))
            lin_specs.append(np.pad(lin, time_pad, mode="constant"))
            mel_specs.append(np.pad(mel, time_pad, mode="constant"))

            # NOTE(review): floor division is what the original code
            # effectively computed (its np.ceil wrapped an already-floored
            # value); confirm floor rather than ceil is intended.
            n_steps = int(n_frames // step)
            done_flags.append(
                np.pad(np.zeros((n_steps, )),
                       (0, max_decoder_length - n_steps),
                       mode="constant",
                       constant_values=1))

        text_sequences = np.array(text_sequences).astype(np.int64)
        # (batch, feature, time) -> (batch, time, feature)
        lin_specs = np.array(lin_specs).transpose(0, 2, 1).astype(np.float32)
        mel_specs = np.array(mel_specs).transpose(0, 2, 1).astype(np.float32)
        done_flags = np.array(done_flags).astype(np.float32)

        # text positions: 1..length for real tokens, 0 for padding
        positions = np.arange(1, 1 + max_text_length, dtype=np.int64)
        is_token = positions <= text_lengths[:, np.newaxis]
        text_positions = np.where(is_token, positions, 0)

        # decoder positions: 1..max_decoder_length for every example
        decoder_positions = np.tile(
            np.arange(
                1, 1 + max_decoder_length, dtype=np.int64),
            (batch_size, 1))

        return (text_sequences, text_lengths, text_positions, mel_specs,
                lin_specs, frames, decoder_positions, done_flags)
|