ParakeetRebeccaRosario/examples/parallelwave_gan/baker/batch_fn.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle


class Clip(object):
    """Collate functor for training vocoders.
    """
    def __init__(
            self,
            batch_max_steps=20480,
            hop_size=256,
            aux_context_window=0, ):
        """Initialize customized collator for paddle.io.DataLoader.

        Args:
            batch_max_steps (int): The maximum length of input signal in batch.
            hop_size (int): Hop size of auxiliary features.
            aux_context_window (int): Context window size for auxiliary feature conv.

        """
        # round batch_max_steps down to a multiple of hop_size
        if batch_max_steps % hop_size != 0:
            batch_max_steps += -(batch_max_steps % hop_size)
        assert batch_max_steps % hop_size == 0
        self.batch_max_steps = batch_max_steps
        self.batch_max_frames = batch_max_steps // hop_size
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window

        # set useful values in random cutting
        self.start_offset = aux_context_window
        self.end_offset = -(self.batch_max_frames + aux_context_window)
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
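
    # Worked example (illustrative, not taken from the original file): with
    # batch_max_steps=20480, hop_size=256 and aux_context_window=2, we get
    # batch_max_frames == 80, start_offset == 2, end_offset == -82 and
    # mel_threshold == 84, so only utterances longer than 84 mel frames are
    # eligible for the random cut performed in __call__.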
    def __call__(self, examples):
        """Convert into batch tensors.

        Args:
            examples (list): list of tuples of the pair of audio and features.

        Returns:
            Tensor: Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
            Tensor: Target signal batch (B, 1, T).

        """
        # check length
        examples = [
            self._adjust_length(*b) for b in examples
            if len(b[1]) > self.mel_threshold
        ]
        xs, cs = [b[0] for b in examples], [b[1] for b in examples]

        # make batch with random cut
        c_lengths = [len(c) for c in cs]
        start_frames = np.array([
            np.random.randint(self.start_offset, cl + self.end_offset)
            for cl in c_lengths
        ])
        x_starts = start_frames * self.hop_size
        x_ends = x_starts + self.batch_max_steps

        c_starts = start_frames - self.aux_context_window
        c_ends = start_frames + self.batch_max_frames + self.aux_context_window
        y_batch = [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)]
        c_batch = [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)]

        # convert each batch to a tensor, assuming every item in the batch has the same length
        y_batch = paddle.to_tensor(
            y_batch, dtype=paddle.float32).unsqueeze(1)  # (B, 1, T)
        c_batch = paddle.to_tensor(
            c_batch, dtype=paddle.float32).transpose([0, 2, 1])  # (B, C, T')

        return (c_batch, ), y_batch
    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

        Note:
            Basically we assume that the lengths of x and c are adjusted
            in the preprocessing stage, but if we use features processed by
            another library, this adjustment is needed.

        """
        if len(x) < len(c) * self.hop_size:
            x = np.pad(x, (0, len(c) * self.hop_size - len(x)), mode="edge")

        # check that the length is valid
        assert len(x) == len(c) * self.hop_size

        return x, c
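

# Usage sketch (not part of the original module): shows how Clip is intended
# to be used as a batch function. The waveform/mel pairs and hyperparameters
# below are hypothetical dummy values, not taken from the Baker recipe config.
if __name__ == "__main__":
    hop_size = 256
    collate_fn = Clip(
        batch_max_steps=20480, hop_size=hop_size, aux_context_window=2)
    # each example is (audio, mel) with len(audio) == len(mel) * hop_size
    num_frames, num_mels = 200, 80
    examples = [(np.random.randn(num_frames * hop_size).astype("float32"),
                 np.random.randn(num_frames, num_mels).astype("float32"))
                for _ in range(4)]
    (c_batch, ), y_batch = collate_fn(examples)
    print(c_batch.shape)  # [4, 80, 84]: 80 frames + 2 * aux_context_window
    print(y_batch.shape)  # [4, 1, 20480]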