From b571b506c34f2bd431b3525d9f723396a596cc74 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 00:25:00 +0800 Subject: [PATCH 01/47] add kbest --- parakeet/training/chekpoint.py | 97 ++++++++++++++++++++++++++++++++++ tests/test_checkpoint.py | 34 ++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 parakeet/training/chekpoint.py create mode 100644 tests/test_checkpoint.py diff --git a/parakeet/training/chekpoint.py b/parakeet/training/chekpoint.py new file mode 100644 index 0000000..2e327c4 --- /dev/null +++ b/parakeet/training/chekpoint.py @@ -0,0 +1,97 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from queue import PriorityQueue +from typing import Callable, Mapping +from pathlib import Path + + +class KBest(object): + """ + A utility class to help save the hard drive by only keeping K best + checkpoints. + + To be as modularized as possible, this class does not assume anything like + a Trainer class or anything like a checkpoint directory, it does not know + about the model or the optimizer, etc. + + It is basically a dynamically mantained K-bset Mapping. When a new item is + added to the map, save_fn is called. And when an item is remove from the + map, del_fn is called. `save_fn` and `del_fn` takes a Path object as input + and returns nothing. + + Though it is designed to control checkpointing behaviors, it can be used + to do something else if you pass some save_fn and del_fn. + + Example + -------- + + >>> from pathlib import Path + >>> import shutil + >>> import paddle + >>> from paddle import nn + + >>> model = nn.Linear(2, 3) + >>> def save_model(path): + ... paddle.save(model.state_dict()) + + >>> kbest_manager = KBest(max_size=5, save_fn=save_model) + >>> checkpoint_dir = Path("checkpoints") + >>> shutil.rmtree(checkpoint_dir) + >>> checkpoint_dir.mkdir(parents=True) + >>> a = np.random.rand(20) + >>> for i, score in enumerate(a): + ... path = checkpoint_dir / f"step_{i}" + ... kbest_manager.add_checkpoint(score, path) + >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5 + """ + + def __init__(self, + max_size: int=5, + save_fn: Callable=None, + del_fn: Callable=lambda f: f.unlink()): + self.best_records: Mapping[Path, float] = {} + self.save_fn = save_fn + self.del_fn = del_fn + self.max_size = max_size + self._save_all = (max_size == -1) + + def should_save(self, metric: float) -> bool: + if not self.full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def full(self): + return (not self._save_all) and len(self.best_records) == self.max_size + + def add_checkpoint(self, metric, path): + if self.should_save(metric): + self.save_checkpoint_and_update(metric, path) + + def save_checkpoint_and_update(self, metric, path): + # remove the worst + if self.full(): + worst_record_path = max(self.best_records, + key=self.best_records.get) + self.best_records.pop(worst_record_path) + self.del_fn(path) + + # add the new one + self.save_fn(path) + self.best_records[path] = metric diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py new file mode 100644 index 0000000..56088ea --- /dev/null +++ b/tests/test_checkpoint.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parakeet.training.chekpoint import KBest +import numpy as np +from pathlib import Path +import shutil + + +def test_kbest(): + def save_fn(path): + with open(path, 'wt') as f: + f.write(f"My path is {str(path)}\n") + + kbest_manager = KBest(max_size=5, save_fn=save_fn) + checkpoint_dir = Path("checkpoints") + shutil.rmtree(checkpoint_dir) + checkpoint_dir.mkdir(parents=True) + a = np.random.rand(20) + for i, score in enumerate(a): + path = checkpoint_dir / f"step_{i}" + kbest_manager.add_checkpoint(score, path) + assert len(list(checkpoint_dir.glob("step_*"))) == 5 From 37a66f1506d00b1531db0f8932b044cf1565ce89 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 00:29:25 +0800 Subject: [PATCH 02/47] fix typos --- parakeet/training/chekpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parakeet/training/chekpoint.py b/parakeet/training/chekpoint.py index 2e327c4..0cfdd5b 100644 --- a/parakeet/training/chekpoint.py +++ b/parakeet/training/chekpoint.py @@ -28,7 +28,7 @@ class KBest(object): about the model or the optimizer, etc. It is basically a dynamically mantained K-bset Mapping. When a new item is - added to the map, save_fn is called. And when an item is remove from the + added to the map, save_fn is called. And when an item is removed from the map, del_fn is called. `save_fn` and `del_fn` takes a Path object as input and returns nothing. From 13323bdf6a120b8a270ec8ca823708b39705d507 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 00:32:25 +0800 Subject: [PATCH 03/47] remove unnecessary imports --- parakeet/training/chekpoint.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parakeet/training/chekpoint.py b/parakeet/training/chekpoint.py index 0cfdd5b..330834a 100644 --- a/parakeet/training/chekpoint.py +++ b/parakeet/training/chekpoint.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import operator -from queue import PriorityQueue from typing import Callable, Mapping from pathlib import Path @@ -42,7 +40,7 @@ class KBest(object): >>> import shutil >>> import paddle >>> from paddle import nn - + >>> model = nn.Linear(2, 3) >>> def save_model(path): ... paddle.save(model.state_dict()) From 0114a808a26358be70153520912a14df1daebeb6 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 00:40:54 +0800 Subject: [PATCH 04/47] fix del fn --- parakeet/training/chekpoint.py | 2 +- tests/test_checkpoint.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/parakeet/training/chekpoint.py b/parakeet/training/chekpoint.py index 330834a..8a5fe4f 100644 --- a/parakeet/training/chekpoint.py +++ b/parakeet/training/chekpoint.py @@ -88,7 +88,7 @@ class KBest(object): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) - self.del_fn(path) + self.del_fn(worst_record_path) # add the new one self.save_fn(path) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 56088ea..45137d5 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -23,7 +23,8 @@ def test_kbest(): with open(path, 'wt') as f: f.write(f"My path is {str(path)}\n") - kbest_manager = KBest(max_size=5, save_fn=save_fn) + K = 1 + kbest_manager = KBest(max_size=K, save_fn=save_fn) checkpoint_dir = Path("checkpoints") shutil.rmtree(checkpoint_dir) checkpoint_dir.mkdir(parents=True) @@ -31,4 +32,4 @@ def test_kbest(): for i, score in enumerate(a): path = checkpoint_dir / f"step_{i}" kbest_manager.add_checkpoint(score, path) - assert len(list(checkpoint_dir.glob("step_*"))) == 5 + assert len(list(checkpoint_dir.glob("step_*"))) == K From dc9040dd4d89230e821a9bf019c2d6ad5a5bfe14 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 02:45:45 +0800 Subject: [PATCH 05/47] fix typos --- parakeet/training/{chekpoint.py => checkpoint.py} | 4 ++-- tests/test_checkpoint.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename parakeet/training/{chekpoint.py => checkpoint.py} (96%) diff --git a/parakeet/training/chekpoint.py b/parakeet/training/checkpoint.py similarity index 96% rename from parakeet/training/chekpoint.py rename to parakeet/training/checkpoint.py index 8a5fe4f..155261d 100644 --- a/parakeet/training/chekpoint.py +++ b/parakeet/training/checkpoint.py @@ -58,8 +58,8 @@ class KBest(object): def __init__(self, max_size: int=5, - save_fn: Callable=None, - del_fn: Callable=lambda f: f.unlink()): + save_fn: Callable[[Path], None]=None, + del_fn: Callable[[Path], None]=lambda f: f.unlink()): self.best_records: Mapping[Path, float] = {} self.save_fn = save_fn self.del_fn = del_fn diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 45137d5..af8df02 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from parakeet.training.chekpoint import KBest +from parakeet.training.checkpoint import KBest import numpy as np from pathlib import Path import shutil From c306f5c2b339f5b7fd9874e1b3428c45fe10f9e8 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 03:39:54 +0800 Subject: [PATCH 06/47] add k-latest --- parakeet/training/checkpoint.py | 71 ++++++++++++++++++++++++++++++++- tests/test_checkpoint.py | 21 +++++++++- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/parakeet/training/checkpoint.py b/parakeet/training/checkpoint.py index 155261d..4bb12e2 100644 --- a/parakeet/training/checkpoint.py +++ b/parakeet/training/checkpoint.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Mapping +from typing import Callable, Mapping, List from pathlib import Path @@ -43,7 +43,7 @@ class KBest(object): >>> model = nn.Linear(2, 3) >>> def save_model(path): - ... paddle.save(model.state_dict()) + ... paddle.save(model.state_dict(), path) >>> kbest_manager = KBest(max_size=5, save_fn=save_model) >>> checkpoint_dir = Path("checkpoints") @@ -93,3 +93,70 @@ class KBest(object): # add the new one self.save_fn(path) self.best_records[path] = metric + + +class KLatest(object): + """ + A utility class to help save the hard drive by only keeping K latest + checkpoints. + + To be as modularized as possible, this class does not assume anything like + a Trainer class or anything like a checkpoint directory, it does not know + about the model or the optimizer, etc. + + It is basically a dynamically mantained Queue. When a new item is + added to the queue, save_fn is called. And when an item is removed from the + queue, del_fn is called. `save_fn` and `del_fn` takes a Path object as input + and returns nothing. + + Though it is designed to control checkpointing behaviors, it can be used + to do something else if you pass some save_fn and del_fn. + + Example + -------- + + >>> from pathlib import Path + >>> import shutil + >>> import paddle + >>> from paddle import nn + + >>> model = nn.Linear(2, 3) + >>> def save_model(path): + ... paddle.save(model.state_dict(), path) + + >>> klatest_manager = KLatest(max_size=5, save_fn=save_model) + >>> checkpoint_dir = Path("checkpoints") + >>> shutil.rmtree(checkpoint_dir) + >>> checkpoint_dir.mkdir(parents=True) + >>> for i in range(20): + ... path = checkpoint_dir / f"step_{i}" + ... klatest_manager.add_checkpoint(path) + >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5 + """ + + def __init__(self, + max_size: int=5, + save_fn: Callable[[Path], None]=None, + del_fn: Callable[[Path], None]=lambda f: f.unlink()): + self.latest_records: List[Path] = [] + self.save_fn = save_fn + self.del_fn = del_fn + self.max_size = max_size + self._save_all = (max_size == -1) + + def full(self): + return ( + not self._save_all) and len(self.latest_records) == self.max_size + + def add_checkpoint(self, path): + self.save_checkpoint_and_update(path) + + def save_checkpoint_and_update(self, path): + # remove the earist + if self.full(): + eariest_record_path = self.latest_records.pop(0) + self.del_fn(eariest_record_path) + + # add the new one + self.save_fn(path) + self.latest_records.append(path) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index af8df02..9173033 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from parakeet.training.checkpoint import KBest -import numpy as np from pathlib import Path import shutil +import numpy as np +from parakeet.training.checkpoint import KBest, KLatest + def test_kbest(): def save_fn(path): @@ -33,3 +34,19 @@ def test_kbest(): path = checkpoint_dir / f"step_{i}" kbest_manager.add_checkpoint(score, path) assert len(list(checkpoint_dir.glob("step_*"))) == K + + +def test_klatest(): + def save_fn(path): + with open(path, 'wt') as f: + f.write(f"My path is {str(path)}\n") + + K = 5 + klatest_manager = KLatest(max_size=K, save_fn=save_fn) + checkpoint_dir = Path("checkpoints") + shutil.rmtree(checkpoint_dir) + checkpoint_dir.mkdir(parents=True) + for i in range(20): + path = checkpoint_dir / f"step_{i}" + klatest_manager.add_checkpoint(path) + assert len(list(checkpoint_dir.glob("step_*"))) == K From 759999c738978a2870079b87e4acaac83128e6e5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 04:06:06 +0800 Subject: [PATCH 07/47] STFT and MelScale: register filters as buffer. --- parakeet/modules/audio.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py index 16c64a4..c44aa66 100644 --- a/parakeet/modules/audio.py +++ b/parakeet/modules/audio.py @@ -20,7 +20,7 @@ import librosa from librosa.util import pad_center import numpy as np -__all__ = ["quantize", "dequantize", "STFT"] +__all__ = ["quantize", "dequantize", "STFT", "MelScale"] def quantize(values, n_bands): @@ -96,10 +96,10 @@ class STFT(nn.Layer): Defaults to True. pad_mode : string or function - If center=True, this argument is passed to np.pad for padding the edges - of the signal y. By default (pad_mode="reflect"), y is padded on both - sides with its own reflection, mirrored around its first and last - sample respectively. If center=False, this argument is ignored. + If center=True, this argument is passed to np.pad for padding the edges + of the signal y. By default (pad_mode="reflect"), y is padded on both + sides with its own reflection, mirrored around its first and last + sample respectively. If center=False, this argument is ignored. @@ -163,17 +163,15 @@ class STFT(nn.Layer): w = np.concatenate([w_real, w_imag], axis=0) w = w * window w = np.expand_dims(w, 1) - self.weight = paddle.cast( - paddle.to_tensor(w), paddle.get_default_dtype()) + weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) + self.register_buffer("weight", weight) def forward(self, x): """Compute the stft transform. - Parameters ------------ x : Tensor [shape=(B, T)] The input waveform. - Returns ------------ real : Tensor [shape=(B, C, frames)] @@ -195,36 +193,32 @@ class STFT(nn.Layer): def power(self, x): """Compute the power spectrum. - Parameters ------------ x : Tensor [shape=(B, T)] The input waveform. - Returns ------------ Tensor [shape=(B, C, T)] The power spectrum. """ - real, imag = self(x) + real, imag = self.forward(x) power = real**2 + imag**2 return power def magnitude(self, x): """Compute the magnitude of the spectrum. - Parameters ------------ x : Tensor [shape=(B, T)] The input waveform. - Returns ------------ Tensor [shape=(B, C, T)] The magnitude of the spectrum. """ power = self.power(x) - magnitude = paddle.sqrt(power) + magnitude = paddle.sqrt(power) # TODO(chenfeiyu): maybe clipping return magnitude @@ -232,7 +226,9 @@ class MelScale(nn.Layer): def __init__(self, sr, n_fft, n_mels, fmin, fmax): super().__init__() mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - self.weight = paddle.to_tensor(mel_basis) + # self.weight = paddle.to_tensor(mel_basis) + weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype()) + self.register_buffer("weight", weight) def forward(self, spec): # (n_mels, n_freq) * (batch_size, n_freq, n_frames) From 3c964fde54ee88ae01cbbdd74a887dcf36fe534d Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 04:08:05 +0800 Subject: [PATCH 08/47] add parallel wavegan model --- parakeet/models/parallel_wavegan.py | 438 ++++++++++++++++++++++++++++ parakeet/modules/stft_loss.py | 138 +++++++++ 2 files changed, 576 insertions(+) create mode 100644 parakeet/models/parallel_wavegan.py create mode 100644 parakeet/modules/stft_loss.py diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py new file mode 100644 index 0000000..3ed6058 --- /dev/null +++ b/parakeet/models/parallel_wavegan.py @@ -0,0 +1,438 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class Stretch2D(nn.Layer): + def __init__(self, x_scale, y_scale, mode="nearest"): + super().__init__() + self.x_scale = x_scale + self.y_scale = y_scale + self.mode = mode + + def forward(self, x): + out = F.interpolate( + x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) + return out + + +class UpsampleNet(nn.Layer): + def __init__(self, + upsample_scales, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + use_causal_conv=False): + super().__init__() + self.use_causal_conv = use_causal_conv + self.up_layers = nn.LayerList() + for scale in upsample_scales: + stretch = Stretch2D(scale, 1, interpolate_mode) + assert freq_axis_kernel_size % 2 == 1 + freq_axis_padding = (freq_axis_kernel_size - 1) // 2 + kernel_size = (freq_axis_kernel_size, scale * 2 + 1) + if use_causal_conv: + padding = (freq_axis_padding, scale * 2) + else: + padding = (freq_axis_padding, scale) + conv = nn.Conv2D( + 1, 1, kernel_size, padding=padding, bias_attr=False) + if nonlinear_activation is not None: + nonlinear = getattr( + nn, nonlinear_activation)(**nonlinear_activation_params) + self.up_layers.extend([stretch, conv, nonlinear]) + + def forward(self, c): + c = c.unsqueeze(1) + for f in self.up_layers: + if self.use_causal_conv and isinstance(f, nn.Conv2D): + c = f(c)[:, :, :, c.shape[-1]] + else: + c = f(c) + return c.squeeze(1) + + +class ConvInUpsampleNet(nn.Layer): + def __init__(self, + upsample_scales, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + aux_channels=80, + aux_context_window=0, + use_causal_conv=False): + super().__init__() + self.aux_context_window = aux_context_window + self.use_causal_conv = use_causal_conv and aux_context_window > 0 + kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 + self.conv_in = nn.Conv1D( + aux_channels, + aux_channels, + kernel_size=kernel_size, + bias_attr=False) + self.upsample = UpsampleNet( + upsample_scales=upsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + use_causal_conv=use_causal_conv) + + def forward(self, c): + c_ = self.conv_in(c) + c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ + return self.upsample(c) + + +class ResidualBlock(nn.Layer): + def __init__(self, + kernel_size=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + dropout=0., + dilation=1, + bias=True, + use_causal_conv=False): + super().__init__() + self.dropout = dropout + if use_causal_conv: + padding = (kernel_size - 1) * dilation + else: + assert kernel_size % 2 == 1 + padding = (kernel_size - 1) // 2 * dilation + self.use_causal_conv = use_causal_conv + + self.conv = nn.Conv1D( + residual_channels, + gate_channels, + kernel_size, + padding=padding, + dilation=dilation, + bias_attr=bias) + if aux_channels is not None: + self.conv1x1_aux = nn.Conv1D( + aux_channels, gate_channels, kernel_size=1, bias_attr=False) + else: + self.conv1x1_aux = None + + gate_out_channels = gate_channels // 2 + self.conv1x1_out = nn.Conv1D( + gate_out_channels, + residual_channels, + kernel_size=1, + bias_attr=bias) + self.conv1x1_skip = nn.Conv1D( + gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias) + + def forward(self, x, c): + x_input = x + x = F.dropout(x, self.dropout, training=self.training) + x = self.conv(x) + x = x[:, :, x_input.shape[-1]] if self.use_causal_conv else x + if c is not None: + c = self.conv1x1_aux(c) + x += c + + a, b = paddle.chunk(x, 2, axis=1) + x = paddle.tanh(a) * F.sigmoid(b) + + skip = self.conv1x1_skip(x) + res = (self.conv1x1_out(x) + x_input) * math.sqrt(0.5) + return res, skip + + +class PWGGenerator(nn.Layer): + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=30, + stacks=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + aux_context_window=2, + dropout=0., + bias=True, + use_weight_norm=True, + use_causal_conv=False, + upsample_scales=[4, 4, 4, 4], + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.aux_context_window = aux_context_window + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.upsample_net = ConvInUpsampleNet( + upsample_scales=upsample_scales, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + aux_channels=aux_channels, + aux_context_window=aux_context_window, + use_causal_conv=use_causal_conv) + self.upsample_factor = np.prod(upsample_scales) + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias, + use_causal_conv=use_causal_conv) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential( + nn.ReLU(), + nn.Conv1D( + skip_channels, skip_channels, 1, bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, out_channels, 1, bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, c): + if c is not None: + c = self.upsample_net(c) + assert c.shape[-1] == x.shape[-1] + + x = self.first_conv(x) + skips = 0 + for f in self.conv_layers: + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + def inference(self, c=None, x=None): + """ + single instance inference + c: [T', C] condition + x: [T, 1] noise + """ + if x is not None: + x = paddle.transpose(x, [1, 0]).unsqueeze(0) # pseudo batch + else: + assert c is not None + x = paddle.randn([1, 1, c.shape[0] * self.upsample_factor]) + + if c is not None: + c = paddle.transpose(c, [1, 0]).unsqueeze(0) # pseudo batch + c = nn.Pad1D(self.aux_context_window, mode='edge')(c) + out = self.forward(x, c).squeeze(0).transpose([1, 0]) + return out + + +class PWGDiscriminator(nn.Layer): + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=10, + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + use_weight_norm=True): + super().__init__() + assert kernel_size % 2 == 1 + assert dilation_factor > 0 + conv_layers = [] + conv_in_channels = in_channels + for i in range(layers - 1): + if i == 0: + dilation = 1 + else: + dilation = i if dilation_factor == 1 else dilation_factor**i + conv_in_channels = conv_channels + padding = (kernel_size - 1) // 2 * dilation + conv_layer = nn.Conv1D( + conv_in_channels, + conv_channels, + kernel_size, + padding=padding, + dilation=dilation, + bias_attr=bias) + nonlinear = getattr( + nn, nonlinear_activation)(**nonlinear_activation_params) + conv_layers.append(conv_layer) + conv_layers.append(nonlinear) + padding = (kernel_size - 1) // 2 + last_conv = nn.Conv1D( + conv_in_channels, + out_channels, + kernel_size, + padding=padding, + bias_attr=bias) + conv_layers.append(last_conv) + self.conv_layers = nn.Sequential(*conv_layers) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x): + return self.conv_layers(x) + + def apply_weight_norm(self): + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class ResidualPWGDiscriminator(nn.Layer): + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + layers=30, + stacks=3, + residual_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0., + bias=True, + use_weight_norm=True, + use_causal_conv=False, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}): + super().__init__() + assert kernel_size % 2 == 1 + self.in_channels = in_channels + self.out_channels = out_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_conv = nn.Sequential( + nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True), + getattr(nn, nonlinear_activation)(**nonlinear_activation_params)) + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=None, # no auxiliary input + dropout=dropout, + dilation=dilation, + bias=bias, + use_causal_conv=use_causal_conv) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential( + getattr(nn, nonlinear_activation)(**nonlinear_activation_params), + nn.Conv1D( + skip_channels, skip_channels, 1, bias_attr=True), + getattr(nn, nonlinear_activation)(**nonlinear_activation_params), + nn.Conv1D( + skip_channels, out_channels, 1, bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x): + x = self.first_conv(x) + skip = 0 + for f in self.conv_layers: + x, h = f(x, None) + skip += h + skip *= math.sqrt(1 / len(self.conv_layers)) + + x = skip + x = self.last_conv_layers(x) + return x + + def apply_weight_norm(self): + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py new file mode 100644 index 0000000..6c5bc9f --- /dev/null +++ b/parakeet/modules/stft_loss.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +from paddle.nn import functional as F + +from parakeet.modules.audio import STFT + + +class SpectralConvergenceLoss(nn.Layer): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, C, T). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, C, T). + Returns: + Tensor: Spectral convergence loss value. + """ + return paddle.norm( + y_mag - x_mag, p="fro") / paddle.norm( + y_mag, p="fro") + + +class LogSTFTMagnitudeLoss(nn.Layer): + """Log STFT magnitude loss module.""" + + def __init__(self): + """Initilize los STFT magnitude loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. + """ + return F.l1_loss(paddle.log(y_mag), paddle.log(x_mag)) + + +class STFTLoss(nn.Layer): + """STFT loss module.""" + + def __init__(self, + fft_size=1024, + shift_size=120, + win_length=600, + window="hann_window"): + """Initialize STFT loss module.""" + super().__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.stft = STFT( + n_fft=fft_size, + hop_length=shift_size, + win_length=win_length, + window=window) + self.spectral_convergence_loss = SpectralConvergenceLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. + """ + x_mag = self.stft.magnitude(x) + y_mag = self.stft.magnitude(y) + sc_loss = self.spectral_convergence_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(nn.Layer): + """Multi resolution STFT loss module.""" + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann", ): + """Initialize Multi resolution STFT loss module. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. + """ + super().__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = nn.LayerList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses.append(STFTLoss(fs, ss, wl, window)) + + def forward(self, x, y): + """Calculate forward propagation. + Args: + x (Tensor): Predicted signal (B, T). + y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. + """ + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss From 6a8b3f92dff2e17b72474a70b3d52f260be27df3 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 16:26:09 +0800 Subject: [PATCH 09/47] add AudioDataset and MelDataset --- parakeet/datasets/audio_dataset.py | 133 +++++++++++++++++++++++++++++ parakeet/datasets/mel_dataset.py | 132 ++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) create mode 100644 parakeet/datasets/audio_dataset.py create mode 100644 parakeet/datasets/mel_dataset.py diff --git a/parakeet/datasets/audio_dataset.py b/parakeet/datasets/audio_dataset.py new file mode 100644 index 0000000..9f6c51a --- /dev/null +++ b/parakeet/datasets/audio_dataset.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union, Optional, Callable, Tuple +from pathlib import Path +from multiprocessing import Manager + +import numpy as np +from paddle.io import Dataset +import logging + + +class AudioDataset(Dataset): + """Dataset to load audio. + + Parameters + ---------- + root_dir : Union[Path, str] + The root of the dataset. + audio_pattern : str + A pattern to recursively find all audio files, by default "*-wave.npy" + audio_length_threshold : int, optional + The minmimal length(number of samples) of the audio, by default None + audio_load_fn : Callable, optional + Function to load the audio, which takes a Path object or str as input, + by default np.load + return_utt_id : bool, optional + Whether to include utterance indentifier in the return value of + __getitem__, by default False + use_cache : bool, optional + Whether to cache seen examples while reading, by default False + """ + + def __init__( + self, + root_dir: Union[Path, str], + audio_pattern: str="*-wave.npy", + audio_length_threshold: Optional[int]=None, + audio_load_fn: Callable=np.load, + return_utt_id: bool=False, + use_cache: bool=False, ): + # allow str and Path that contains '~' + root_dir = Path(root_dir).expanduser() + + # recursively find all of audio files that match thr pattern + audio_files = sorted(list(root_dir.rglob(audio_pattern))) + + # filter by threshold + if audio_length_threshold is not None: + audio_lengths = [audio_load_fn(f).shape[0] for f in audio_files] + idxs = [ + idx for idx in range(len(audio_files)) + if audio_lengths[idx] > audio_length_threshold + ] + if len(audio_files) != len(idxs): + logging.warning( + f"some files are filtered by audio length threshold " + f"({len(audio_files)} -> {len(idxs)}).") + audio_files = [audio_files[idx] for idx in idxs] + + # assert the number of files + assert len( + audio_files) != 0, f"Not any audio files found in {root_dir}." + + self.audio_files = audio_files + self.audio_load_fn = audio_load_fn + self.return_utt_id = return_utt_id + # TODO(chenfeiyu): better strategy to get utterance id + if ".npy" in audio_pattern: + self.utt_ids = [ + f.name.replace("-wave.npy", "") for f in audio_files + ] + else: + self.utt_ids = [f.stem for f in audio_files] + self.use_cache = use_cache + if use_cache: + # use manager to share object between multiple processes + # avoid per-reader process caching + self.manager = Manager() + self.caches = self.manager.list() + self.caches += [None for _ in range(len(audio_files))] + + def __getitem__(self, idx: int) -> Tuple[str, np.ndarray]: + """Get an example given the index. + + Parameters + ---------- + idx : int + The index. + + Returns + ------- + utt_id : str + Utterance identifier. + audio : np.ndarray + Shape (n_samples, ), the audio. + """ + if self.use_cache and self.caches[idx] is not None: + return self.caches[idx] + + utt_id = self.utt_ids[idx] + audio = self.audio_load_fn(self.audio_files[idx]) + + if self.return_utt_id: + items = utt_id, audio + else: + items = audio + + if self.use_cache: + self.caches[idx] = items + + return items + + def __len__(self) -> int: + """Returns the size of the dataset. + + Returns + ------- + int + The length of the dataset + """ + return len(self.audio_files) diff --git a/parakeet/datasets/mel_dataset.py b/parakeet/datasets/mel_dataset.py new file mode 100644 index 0000000..038654c --- /dev/null +++ b/parakeet/datasets/mel_dataset.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union, Optional, Callable, Tuple +from pathlib import Path +from multiprocessing import Manager + +import numpy as np +from paddle.io import Dataset +import logging + + +class MelDataset(Dataset): + """Dataset to load mel-spectrograms. + + Parameters + ---------- + root_dir : Union[Path, str] + The root of the dataset. + mel_pattern : str, optional + A pattern to recursively find all mel feature files, by default + "*-feats.npy" + mel_length_threshold : Optional[int], optional + The minmimal length(number of frames) of the audio, by default None + mel_load_fn : Callable, optional + Function to load the audio, which takes a Path object or str as input, + by default np.load + return_utt_id : bool, optional + Whether to include utterance indentifier in the return value of + __getitem__, by default False + use_cahce : bool, optional + Whether to cache seen examples while reading, by default False + """ + + def __init__( + self, + root_dir: Union[Path, str], + mel_pattern: str="*-feats.npy", + mel_length_threshold: Optional[int]=None, + mel_load_fn: Callable=np.load, + return_utt_id: bool=False, + use_cahce: bool=False, ): + # allow str and Path that contains '~' + root_dir = Path(root_dir).expanduser() + + # find all of the mel files + mel_files = sorted(list(root_dir.rglob(mel_pattern))) + + # filter by threshold + if mel_length_threshold is not None: + mel_lengths = [mel_load_fn(f).shape[1] for f in mel_files] + idxs = [ + idx for idx in range(len(mel_files)) + if mel_lengths[idx] > mel_length_threshold + ] + if len(mel_files) != len(idxs): + logging.warning( + f"Some files are filtered by mel length threshold " + f"({len(mel_files)} -> {len(idxs)}).") + mel_files = [mel_files[idx] for idx in idxs] + + # assert the number of files + assert len(mel_files) != 0, f"Not found any mel files in {root_dir}." + + self.mel_files = mel_files + self.mel_load_fn = mel_load_fn + + # TODO(chenfeiyu): better strategy to get utterance id + if ".npy" in mel_pattern: + self.utt_ids = [ + f.name.replace("-feats.npy", "") for f in mel_files + ] + else: + self.utt_ids = [f.stem for f in mel_files] + self.return_utt_id = return_utt_id + self.use_cache = use_cahce + if use_cahce: + self.manager = Manager() + self.caches = self.manager.list() + self.caches += [None for _ in range(len(mel_files))] + + def __getitem__(self, idx): + """Get an example given the index. + + Parameters + ---------- + idx : int + The index + + Returns + ------- + utt_id : str + Utterance identifier. + audio : np.ndarray + Shape (n_mels, n_frames), the mel spectrogram. + """ + if self.use_cache and self.caches[idx] is not None: + return self.caches[idx] + + utt_id = self.utt_ids[idx] + mel = self.mel_load_fn(self.mel_files[idx]) + + if self.return_utt_id: + items = utt_id, mel + else: + items = mel + + if self.use_cache: + self.caches[idx] = items + + return items + + def __len__(self): + """Returns the size of the dataset. + + Returns + ------- + int + The length of the dataset + """ + return len(self.mel_files) From 988d6d326868cec8a39455756cab57b962d97f5f Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 17:58:25 +0800 Subject: [PATCH 10/47] add h5py utility to interact with numpy --- parakeet/utils/h5_utils.py | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 parakeet/utils/h5_utils.py diff --git a/parakeet/utils/h5_utils.py b/parakeet/utils/h5_utils.py new file mode 100644 index 0000000..229fe8d --- /dev/null +++ b/parakeet/utils/h5_utils.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Union, Any +import sys +import logging +import h5py +import numpy as np + + +def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: + """Read a dataset from a HDF5 file. + + Parameters + ---------- + filename : Union[Path, str] + Path of the HDF5 file. + dataset_name : str + Name of the dataset to read. + + Returns + ------- + Any + The retrieved dataset. + """ + filename = Path(filename) + + if not filename.exists(): + logging.error(f"There is no such a hdf5 file ({filename}).") + sys.exit(1) + + hdf5_file = h5py.File(filename, "r") + + if dataset_name not in hdf5_file: + logging.error( + f"There is no such a data in hdf5 file. ({dataset_name})") + sys.exit(1) + + hdf5_data = hdf5_file[dataset_name][()] # a special syntax of h5py + hdf5_file.close() + + return hdf5_data + + +def write_hdf5(filename: Union[Path, str], + dataset_name: str, + write_data: np.ndarrays, + is_overwrite: bool=True) -> None: + """Write dataset to HDF5 file. + + Parameters + ---------- + filename : Union[Path, str] + Path of the HDF5 file. + dataset_name : str + Name of the dataset to write to. + write_data : np.ndarrays + The data to write. + is_overwrite : bool, optional + Whether to overwrite, by default True + """ + # convert to numpy array + filename = Path(filename) + write_data = np.array(write_data) + + # check folder existence + filename.parent.mkdir(parents=True, exist_ok=True) + + # check hdf5 existence + if filename.exists(): + # if already exists, open with r+ mode + hdf5_file = h5py.File(filename, "r+") + # check dataset existence + if dataset_name in hdf5_file: + if is_overwrite: + logging.warning("Dataset in hdf5 file already exists. " + "recreate dataset in hdf5.") + hdf5_file.__delitem__(dataset_name) + else: + logging.error( + "Dataset in hdf5 file already exists. " + "if you want to overwrite, please set is_overwrite = True.") + hdf5_file.close() + sys.exit(1) + else: + # if not exists, open with w mode + hdf5_file = h5py.File(filename, "w") + + # write data to hdf5 + hdf5_file.create_dataset(dataset_name, data=write_data) + hdf5_file.flush() + hdf5_file.close() From 60c16dcfb7ed6754a576e96c7b6ac837fdbed5cd Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 20:16:14 +0800 Subject: [PATCH 11/47] add AudioMelDataset for training vocoders. --- parakeet/datasets/audio_mel_dataset.py | 161 +++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 parakeet/datasets/audio_mel_dataset.py diff --git a/parakeet/datasets/audio_mel_dataset.py b/parakeet/datasets/audio_mel_dataset.py new file mode 100644 index 0000000..c3fd00f --- /dev/null +++ b/parakeet/datasets/audio_mel_dataset.py @@ -0,0 +1,161 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union, Optional, Callable, Tuple, Any +from pathlib import Path +from multiprocessing import Manager + +import numpy as np +from paddle.io import Dataset +import logging + + +class AudioMelDataset(Dataset): + """Dataset to laod audio and mel dataset. + + Parameters + ---------- + root_dir : Union[Path, str] + The root of the dataset. + audio_pattern : str, optional + A pattern to recursively find all audio files, by default + "*-wave.npy" + mel_pattern : str, optional + A pattern to recursively find all mel feature files, by default + "*-mel.npy" + audio_load_fn : Callable, optional + Function to load the audio, which takes a Path object or str as + input, by default np.load + mel_load_fn : Callable, optional + Function to load the mel features, which takes a Path object or + str as input, by default np.load + audio_length_threshold : Optional[int], optional + The minmimal length(number of samples) of the audio, by default None + mel_length_threshold : Optional[int], optional + The minmimal length(number of frames) of the audio, by default None + return_utt_id : bool, optional + Whether to include utterance indentifier in the return value of + __getitem__, by default False + use_cache : bool, optional + Whether to cache seen examples while reading, by default False + """ + + def __init__(self, + root_dir: Union[Path, str], + audio_pattern: str="*-wave.npy", + mel_pattern: str="*-mel.npy", + audio_load_fn: Callable=np.load, + mel_load_fn: Callable=np.load, + audio_length_threshold: Optional[int]=None, + mel_length_threshold: Optional[int]=None, + return_utt_id: bool=False, + use_cache: bool=False): + root_dir = Path(root_dir).expanduser() + # find all of audio and mel files + audio_files = sorted(list(root_dir.rglob(audio_pattern))) + mel_files = sorted(list(root_dir.rglob(mel_pattern))) + + # filter by threshold + if audio_length_threshold is not None: + audio_lengths = [audio_load_fn(f).shape[0] for f in audio_files] + idxs = [ + idx for idx in range(len(audio_files)) + if audio_lengths[idx] > audio_length_threshold + ] + if len(audio_files) != len(idxs): + logging.warning( + f"Some files are filtered by audio length threshold " + f"({len(audio_files)} -> {len(idxs)}).") + audio_files = [audio_files[idx] for idx in idxs] + mel_files = [mel_files[idx] for idx in idxs] + if mel_length_threshold is not None: + mel_lengths = [mel_load_fn(f).shape[1] for f in mel_files] + idxs = [ + idx for idx in range(len(mel_files)) + if mel_lengths[idx] > mel_length_threshold + ] + if len(mel_files) != len(idxs): + logging.warning( + f"Some files are filtered by mel length threshold " + f"({len(mel_files)} -> {len(idxs)}).") + audio_files = [audio_files[idx] for idx in idxs] + mel_files = [mel_files[idx] for idx in idxs] + + # assert the number of files + assert len( + audio_files) != 0, f"Not found any audio files in {root_dir}." + assert len(audio_files) == len(mel_files), \ + (f"Number of audio and mel files are different " + f"({len(audio_files)} vs {len(mel_files)}).") + + self.audio_files = audio_files + self.audio_load_fn = audio_load_fn + self.mel_load_fn = mel_load_fn + self.mel_files = mel_files + if ".npy" in audio_pattern: + self.utt_ids = [ + f.name.replace("-wave.npy", "") for f in audio_files + ] + else: + self.utt_ids = [f.stem for f in audio_files] + self.return_utt_id = return_utt_id + self.use_cache = use_cache + if use_cache: + self.manager = Manager() + self.caches = self.manager.list() + self.caches += [None for _ in range(len(audio_files))] + + def __getitem__(self, idx: int) -> Tuple: + """Get an example given the index. + + Parameters + ---------- + idx : int + The index of the example. + + Returns + ------- + utt_id : str + Utterance identifier. + audio : np.ndarray + Shape (n_samples, ), the audio. + mel: np.ndarray + Shape (n_mels, n_frames), the mel spectrogram. + """ + if self.use_cache and self.caches[idx] is not None: + return self.caches[idx] + + utt_id = self.utt_ids[idx] + audio = self.audio_load_fn(self.audio_files[idx]) + mel = self.mel_load_fn(self.mel_files[idx]) + + if self.return_utt_id: + items = utt_id, audio, mel + else: + items = audio, mel + + if self.use_cache: + self.caches[idx] = items + + return items + + def __len__(self): + """Returns the size of the dataset. + + Returns + ------- + int + The length of the dataset + """ + return len(self.audio_files) From 3bf2e7173487b29e0685dd5b68de28272d341fba Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 20:34:59 +0800 Subject: [PATCH 12/47] checkpoint utility: add support for str in addition to Path object --- parakeet/training/checkpoint.py | 7 ++++--- tests/test_checkpoint.py | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/parakeet/training/checkpoint.py b/parakeet/training/checkpoint.py index 4bb12e2..bbbbdc0 100644 --- a/parakeet/training/checkpoint.py +++ b/parakeet/training/checkpoint.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, Mapping, List +from typing import Callable, Mapping, List, Union +import os from pathlib import Path @@ -58,8 +59,8 @@ class KBest(object): def __init__(self, max_size: int=5, - save_fn: Callable[[Path], None]=None, - del_fn: Callable[[Path], None]=lambda f: f.unlink()): + save_fn: Callable[[Union[Path, str]], None]=None, + del_fn: Callable[[Union[Path, str]], None]=os.remove): self.best_records: Mapping[Path, float] = {} self.save_fn = save_fn self.del_fn = del_fn diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 9173033..120115f 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -27,13 +27,15 @@ def test_kbest(): K = 1 kbest_manager = KBest(max_size=K, save_fn=save_fn) checkpoint_dir = Path("checkpoints") - shutil.rmtree(checkpoint_dir) + if checkpoint_dir.exists(): + shutil.rmtree(checkpoint_dir) checkpoint_dir.mkdir(parents=True) a = np.random.rand(20) for i, score in enumerate(a): path = checkpoint_dir / f"step_{i}" kbest_manager.add_checkpoint(score, path) assert len(list(checkpoint_dir.glob("step_*"))) == K + shutil.rmtree(checkpoint_dir) def test_klatest(): @@ -44,9 +46,11 @@ def test_klatest(): K = 5 klatest_manager = KLatest(max_size=K, save_fn=save_fn) checkpoint_dir = Path("checkpoints") - shutil.rmtree(checkpoint_dir) + if checkpoint_dir.exists(): + shutil.rmtree(checkpoint_dir) checkpoint_dir.mkdir(parents=True) for i in range(20): path = checkpoint_dir / f"step_{i}" klatest_manager.add_checkpoint(path) assert len(list(checkpoint_dir.glob("step_*"))) == K + shutil.rmtree(checkpoint_dir) From 13ab0bd608fe427feeb479bf4058d1b6ce7fd7c4 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 10 Jun 2021 22:57:21 +0800 Subject: [PATCH 13/47] remove task-specific Datasets and add a general purpose DataTable --- parakeet/datasets/audio_dataset.py | 133 -------------------- parakeet/datasets/audio_mel_dataset.py | 161 ------------------------- parakeet/datasets/data_tabel.py | 150 +++++++++++++++++++++++ parakeet/datasets/mel_dataset.py | 132 -------------------- tests/test_data_table.py | 22 ++++ 5 files changed, 172 insertions(+), 426 deletions(-) delete mode 100644 parakeet/datasets/audio_dataset.py delete mode 100644 parakeet/datasets/audio_mel_dataset.py create mode 100644 parakeet/datasets/data_tabel.py delete mode 100644 parakeet/datasets/mel_dataset.py create mode 100644 tests/test_data_table.py diff --git a/parakeet/datasets/audio_dataset.py b/parakeet/datasets/audio_dataset.py deleted file mode 100644 index 9f6c51a..0000000 --- a/parakeet/datasets/audio_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union, Optional, Callable, Tuple -from pathlib import Path -from multiprocessing import Manager - -import numpy as np -from paddle.io import Dataset -import logging - - -class AudioDataset(Dataset): - """Dataset to load audio. - - Parameters - ---------- - root_dir : Union[Path, str] - The root of the dataset. - audio_pattern : str - A pattern to recursively find all audio files, by default "*-wave.npy" - audio_length_threshold : int, optional - The minmimal length(number of samples) of the audio, by default None - audio_load_fn : Callable, optional - Function to load the audio, which takes a Path object or str as input, - by default np.load - return_utt_id : bool, optional - Whether to include utterance indentifier in the return value of - __getitem__, by default False - use_cache : bool, optional - Whether to cache seen examples while reading, by default False - """ - - def __init__( - self, - root_dir: Union[Path, str], - audio_pattern: str="*-wave.npy", - audio_length_threshold: Optional[int]=None, - audio_load_fn: Callable=np.load, - return_utt_id: bool=False, - use_cache: bool=False, ): - # allow str and Path that contains '~' - root_dir = Path(root_dir).expanduser() - - # recursively find all of audio files that match thr pattern - audio_files = sorted(list(root_dir.rglob(audio_pattern))) - - # filter by threshold - if audio_length_threshold is not None: - audio_lengths = [audio_load_fn(f).shape[0] for f in audio_files] - idxs = [ - idx for idx in range(len(audio_files)) - if audio_lengths[idx] > audio_length_threshold - ] - if len(audio_files) != len(idxs): - logging.warning( - f"some files are filtered by audio length threshold " - f"({len(audio_files)} -> {len(idxs)}).") - audio_files = [audio_files[idx] for idx in idxs] - - # assert the number of files - assert len( - audio_files) != 0, f"Not any audio files found in {root_dir}." - - self.audio_files = audio_files - self.audio_load_fn = audio_load_fn - self.return_utt_id = return_utt_id - # TODO(chenfeiyu): better strategy to get utterance id - if ".npy" in audio_pattern: - self.utt_ids = [ - f.name.replace("-wave.npy", "") for f in audio_files - ] - else: - self.utt_ids = [f.stem for f in audio_files] - self.use_cache = use_cache - if use_cache: - # use manager to share object between multiple processes - # avoid per-reader process caching - self.manager = Manager() - self.caches = self.manager.list() - self.caches += [None for _ in range(len(audio_files))] - - def __getitem__(self, idx: int) -> Tuple[str, np.ndarray]: - """Get an example given the index. - - Parameters - ---------- - idx : int - The index. - - Returns - ------- - utt_id : str - Utterance identifier. - audio : np.ndarray - Shape (n_samples, ), the audio. - """ - if self.use_cache and self.caches[idx] is not None: - return self.caches[idx] - - utt_id = self.utt_ids[idx] - audio = self.audio_load_fn(self.audio_files[idx]) - - if self.return_utt_id: - items = utt_id, audio - else: - items = audio - - if self.use_cache: - self.caches[idx] = items - - return items - - def __len__(self) -> int: - """Returns the size of the dataset. - - Returns - ------- - int - The length of the dataset - """ - return len(self.audio_files) diff --git a/parakeet/datasets/audio_mel_dataset.py b/parakeet/datasets/audio_mel_dataset.py deleted file mode 100644 index c3fd00f..0000000 --- a/parakeet/datasets/audio_mel_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union, Optional, Callable, Tuple, Any -from pathlib import Path -from multiprocessing import Manager - -import numpy as np -from paddle.io import Dataset -import logging - - -class AudioMelDataset(Dataset): - """Dataset to laod audio and mel dataset. - - Parameters - ---------- - root_dir : Union[Path, str] - The root of the dataset. - audio_pattern : str, optional - A pattern to recursively find all audio files, by default - "*-wave.npy" - mel_pattern : str, optional - A pattern to recursively find all mel feature files, by default - "*-mel.npy" - audio_load_fn : Callable, optional - Function to load the audio, which takes a Path object or str as - input, by default np.load - mel_load_fn : Callable, optional - Function to load the mel features, which takes a Path object or - str as input, by default np.load - audio_length_threshold : Optional[int], optional - The minmimal length(number of samples) of the audio, by default None - mel_length_threshold : Optional[int], optional - The minmimal length(number of frames) of the audio, by default None - return_utt_id : bool, optional - Whether to include utterance indentifier in the return value of - __getitem__, by default False - use_cache : bool, optional - Whether to cache seen examples while reading, by default False - """ - - def __init__(self, - root_dir: Union[Path, str], - audio_pattern: str="*-wave.npy", - mel_pattern: str="*-mel.npy", - audio_load_fn: Callable=np.load, - mel_load_fn: Callable=np.load, - audio_length_threshold: Optional[int]=None, - mel_length_threshold: Optional[int]=None, - return_utt_id: bool=False, - use_cache: bool=False): - root_dir = Path(root_dir).expanduser() - # find all of audio and mel files - audio_files = sorted(list(root_dir.rglob(audio_pattern))) - mel_files = sorted(list(root_dir.rglob(mel_pattern))) - - # filter by threshold - if audio_length_threshold is not None: - audio_lengths = [audio_load_fn(f).shape[0] for f in audio_files] - idxs = [ - idx for idx in range(len(audio_files)) - if audio_lengths[idx] > audio_length_threshold - ] - if len(audio_files) != len(idxs): - logging.warning( - f"Some files are filtered by audio length threshold " - f"({len(audio_files)} -> {len(idxs)}).") - audio_files = [audio_files[idx] for idx in idxs] - mel_files = [mel_files[idx] for idx in idxs] - if mel_length_threshold is not None: - mel_lengths = [mel_load_fn(f).shape[1] for f in mel_files] - idxs = [ - idx for idx in range(len(mel_files)) - if mel_lengths[idx] > mel_length_threshold - ] - if len(mel_files) != len(idxs): - logging.warning( - f"Some files are filtered by mel length threshold " - f"({len(mel_files)} -> {len(idxs)}).") - audio_files = [audio_files[idx] for idx in idxs] - mel_files = [mel_files[idx] for idx in idxs] - - # assert the number of files - assert len( - audio_files) != 0, f"Not found any audio files in {root_dir}." - assert len(audio_files) == len(mel_files), \ - (f"Number of audio and mel files are different " - f"({len(audio_files)} vs {len(mel_files)}).") - - self.audio_files = audio_files - self.audio_load_fn = audio_load_fn - self.mel_load_fn = mel_load_fn - self.mel_files = mel_files - if ".npy" in audio_pattern: - self.utt_ids = [ - f.name.replace("-wave.npy", "") for f in audio_files - ] - else: - self.utt_ids = [f.stem for f in audio_files] - self.return_utt_id = return_utt_id - self.use_cache = use_cache - if use_cache: - self.manager = Manager() - self.caches = self.manager.list() - self.caches += [None for _ in range(len(audio_files))] - - def __getitem__(self, idx: int) -> Tuple: - """Get an example given the index. - - Parameters - ---------- - idx : int - The index of the example. - - Returns - ------- - utt_id : str - Utterance identifier. - audio : np.ndarray - Shape (n_samples, ), the audio. - mel: np.ndarray - Shape (n_mels, n_frames), the mel spectrogram. - """ - if self.use_cache and self.caches[idx] is not None: - return self.caches[idx] - - utt_id = self.utt_ids[idx] - audio = self.audio_load_fn(self.audio_files[idx]) - mel = self.mel_load_fn(self.mel_files[idx]) - - if self.return_utt_id: - items = utt_id, audio, mel - else: - items = audio, mel - - if self.use_cache: - self.caches[idx] = items - - return items - - def __len__(self): - """Returns the size of the dataset. - - Returns - ------- - int - The length of the dataset - """ - return len(self.audio_files) diff --git a/parakeet/datasets/data_tabel.py b/parakeet/datasets/data_tabel.py new file mode 100644 index 0000000..75d075d --- /dev/null +++ b/parakeet/datasets/data_tabel.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union, Optional, Callable, Tuple, List, Dict, Any +from pathlib import Path +from multiprocessing import Manager + +import numpy as np +from paddle.io import Dataset + + +class DataTable(Dataset): + def __init__(self, + data: List[Dict[str, Any]], + fields: List[str]=None, + converters: Dict[str, Callable]=None, + use_cache: bool=False): + """Dataset to load and convert data for general purpose. + + Parameters + ---------- + data : List[Dict[str, Any]] + Metadata, a list of meta datum, each of which is composed of + several fields + fields : List[str], optional + Fields to use, if not specified, all the fields in the data are + used, by default None + converters : Dict[str, Callable], optional + Converters used to process each field, by default None + use_cache : bool, optional + Whether to use cache, by default False + + Raises + ------ + ValueError + If there is some field that does not exist in data. + ValueError + If there is some field in converters that does not exist in fields. + """ + # metadata + self.data = data + assert len(data) > 0, "This dataset has no examples" + + # peak an example to get existing fields. + first_example = self.data[0] + fields_in_data = first_example.keys() + + # check all the requested fields exist + if fields is None: + self.fields = fields_in_data + else: + for field in fields: + if field not in fields_in_data: + raise ValueError( + f"The requested field ({field}) is not found" + f"in the data. Fields in the data is {fields_in_data}") + self.fields = fields + + # check converters + if converters is None: + self.converters = {} + else: + for field in converters.keys(): + if field not in self.fields: + raise ValueError( + f"The converter has a non existing field ({field})") + self.converters = converters + + self.use_cache = use_cache + if use_cache: + self._initialize_cache() + + def _initialize_cache(self): + self.manager = Manager() + self.caches = self.manager.list() + self.caches += [None for _ in range(len(self))] + + def _get_metadata(self, idx: int) -> Dict[str, Any]: + """Return a meta-datum given an index.""" + return self.data[idx] + + def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]: + """Convert a meta datum to an example by applying the corresponding + converters to each fields requested. + + Parameters + ---------- + meta_datum : Dict[str, Any] + Meta datum + + Returns + ------- + Dict[str, Any] + Converted example + """ + example = {} + for field in self.fields: + converter = self.converters.get(field, None) + meta_datum_field = meta_datum[field] + if converter is not None: + converted_field = converter(meta_datum_field) + else: + converted_field = meta_datum_field + example[field] = converted_field + return example + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get an example given an index. + + Parameters + ---------- + idx : int + Index of the example to get + + Returns + ------- + Dict[str, Any] + A converted example + """ + if self.use_cache and self.caches[idx] is not None: + return self.caches[idx] + + meta_datum = self._get_metadata(idx) + example = self._convert(meta_datum) + + if self.use_cache: + self.caches[idx] = example + + return example + + def __len__(self) -> int: + """Returns the size of the dataset. + + Returns + ------- + int + The length of the dataset + """ + return len(self.data) diff --git a/parakeet/datasets/mel_dataset.py b/parakeet/datasets/mel_dataset.py deleted file mode 100644 index 038654c..0000000 --- a/parakeet/datasets/mel_dataset.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union, Optional, Callable, Tuple -from pathlib import Path -from multiprocessing import Manager - -import numpy as np -from paddle.io import Dataset -import logging - - -class MelDataset(Dataset): - """Dataset to load mel-spectrograms. - - Parameters - ---------- - root_dir : Union[Path, str] - The root of the dataset. - mel_pattern : str, optional - A pattern to recursively find all mel feature files, by default - "*-feats.npy" - mel_length_threshold : Optional[int], optional - The minmimal length(number of frames) of the audio, by default None - mel_load_fn : Callable, optional - Function to load the audio, which takes a Path object or str as input, - by default np.load - return_utt_id : bool, optional - Whether to include utterance indentifier in the return value of - __getitem__, by default False - use_cahce : bool, optional - Whether to cache seen examples while reading, by default False - """ - - def __init__( - self, - root_dir: Union[Path, str], - mel_pattern: str="*-feats.npy", - mel_length_threshold: Optional[int]=None, - mel_load_fn: Callable=np.load, - return_utt_id: bool=False, - use_cahce: bool=False, ): - # allow str and Path that contains '~' - root_dir = Path(root_dir).expanduser() - - # find all of the mel files - mel_files = sorted(list(root_dir.rglob(mel_pattern))) - - # filter by threshold - if mel_length_threshold is not None: - mel_lengths = [mel_load_fn(f).shape[1] for f in mel_files] - idxs = [ - idx for idx in range(len(mel_files)) - if mel_lengths[idx] > mel_length_threshold - ] - if len(mel_files) != len(idxs): - logging.warning( - f"Some files are filtered by mel length threshold " - f"({len(mel_files)} -> {len(idxs)}).") - mel_files = [mel_files[idx] for idx in idxs] - - # assert the number of files - assert len(mel_files) != 0, f"Not found any mel files in {root_dir}." - - self.mel_files = mel_files - self.mel_load_fn = mel_load_fn - - # TODO(chenfeiyu): better strategy to get utterance id - if ".npy" in mel_pattern: - self.utt_ids = [ - f.name.replace("-feats.npy", "") for f in mel_files - ] - else: - self.utt_ids = [f.stem for f in mel_files] - self.return_utt_id = return_utt_id - self.use_cache = use_cahce - if use_cahce: - self.manager = Manager() - self.caches = self.manager.list() - self.caches += [None for _ in range(len(mel_files))] - - def __getitem__(self, idx): - """Get an example given the index. - - Parameters - ---------- - idx : int - The index - - Returns - ------- - utt_id : str - Utterance identifier. - audio : np.ndarray - Shape (n_mels, n_frames), the mel spectrogram. - """ - if self.use_cache and self.caches[idx] is not None: - return self.caches[idx] - - utt_id = self.utt_ids[idx] - mel = self.mel_load_fn(self.mel_files[idx]) - - if self.return_utt_id: - items = utt_id, mel - else: - items = mel - - if self.use_cache: - self.caches[idx] = items - - return items - - def __len__(self): - """Returns the size of the dataset. - - Returns - ------- - int - The length of the dataset - """ - return len(self.mel_files) diff --git a/tests/test_data_table.py b/tests/test_data_table.py new file mode 100644 index 0000000..aca0605 --- /dev/null +++ b/tests/test_data_table.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parakeet.datasets.data_tabel import DataTable + + +def test_audio_dataset(): + metadata = [{'name': 'Sonic', 'v': 1000}, {'name': 'Prestol', 'v': 2000}] + converters = {'v': lambda x: x / 1000} + dataset = DataTable(metadata, fields=['v'], converters=converters) + assert dataset[0] == {'v': 1.0} From f0a5ac8c5a1e7e0037da276574c864caad32b898 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 11 Jun 2021 00:59:07 +0800 Subject: [PATCH 14/47] move docstring to class --- parakeet/datasets/data_tabel.py | 45 +++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/parakeet/datasets/data_tabel.py b/parakeet/datasets/data_tabel.py index 75d075d..78a3608 100644 --- a/parakeet/datasets/data_tabel.py +++ b/parakeet/datasets/data_tabel.py @@ -21,33 +21,34 @@ from paddle.io import Dataset class DataTable(Dataset): + """Dataset to load and convert data for general purpose. + + Parameters + ---------- + data : List[Dict[str, Any]] + Metadata, a list of meta datum, each of which is composed of + several fields + fields : List[str], optional + Fields to use, if not specified, all the fields in the data are + used, by default None + converters : Dict[str, Callable], optional + Converters used to process each field, by default None + use_cache : bool, optional + Whether to use cache, by default False + + Raises + ------ + ValueError + If there is some field that does not exist in data. + ValueError + If there is some field in converters that does not exist in fields. + """ + def __init__(self, data: List[Dict[str, Any]], fields: List[str]=None, converters: Dict[str, Callable]=None, use_cache: bool=False): - """Dataset to load and convert data for general purpose. - - Parameters - ---------- - data : List[Dict[str, Any]] - Metadata, a list of meta datum, each of which is composed of - several fields - fields : List[str], optional - Fields to use, if not specified, all the fields in the data are - used, by default None - converters : Dict[str, Callable], optional - Converters used to process each field, by default None - use_cache : bool, optional - Whether to use cache, by default False - - Raises - ------ - ValueError - If there is some field that does not exist in data. - ValueError - If there is some field in converters that does not exist in fields. - """ # metadata self.data = data assert len(data) > 0, "This dataset has no examples" From 258083aea9862980635eb7708825c176ad201636 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 11 Jun 2021 01:01:43 +0800 Subject: [PATCH 15/47] fix file names --- .pre-commit-config.yaml | 7 ++++--- parakeet/datasets/{data_tabel.py => data_table.py} | 0 2 files changed, 4 insertions(+), 3 deletions(-) rename parakeet/datasets/{data_tabel.py => data_table.py} (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d6da44..6f222bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,11 @@ +repos: - repo: https://github.com/PaddlePaddle/mirrors-yapf.git - sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 + rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 hooks: - id: yapf files: \.py$ - repo: https://github.com/pre-commit/pre-commit-hooks - sha: a11d9314b22d8f8c7556443875b731ef05965464 + rev: a11d9314b22d8f8c7556443875b731ef05965464 hooks: - id: check-merge-conflict - id: check-symlinks @@ -15,7 +16,7 @@ - id: trailing-whitespace files: \.md$ - repo: https://github.com/Lucas-C/pre-commit-hooks - sha: v1.0.1 + rev: v1.0.1 hooks: - id: forbid-crlf files: \.md$ diff --git a/parakeet/datasets/data_tabel.py b/parakeet/datasets/data_table.py similarity index 100% rename from parakeet/datasets/data_tabel.py rename to parakeet/datasets/data_table.py From 66062d29e585426aeadbc208f0b65f9017af4539 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 11 Jun 2021 02:43:10 +0800 Subject: [PATCH 16/47] WIP: add sample code for parallel wavegan --- parakeet/models/parallel_wavegan.py | 360 +++++++++++++++++++++++----- 1 file changed, 300 insertions(+), 60 deletions(-) diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index 3ed6058..aded823 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -13,33 +13,88 @@ # limitations under the License. import math +from typing import List, Dict, Any, Union, Optional + import numpy as np import paddle +from paddle import Tensor from paddle import nn from paddle.nn import functional as F class Stretch2D(nn.Layer): - def __init__(self, x_scale, y_scale, mode="nearest"): + def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): + """Strech an image (or image-like object) with some interpolation. + + Parameters + ---------- + w_scale : int + Scalar of width. + h_scale : int + Scalar of the height. + mode : str, optional + Interpolation mode, modes suppored are "nearest", "bilinear", + "trilinear", "bicubic", "linear" and "area",by default "nearest" + + For more details about interpolation, see + `paddle.nn.functional.interpolate `_. + """ super().__init__() - self.x_scale = x_scale - self.y_scale = y_scale + self.w_scale = w_scale + self.h_scale = h_scale self.mode = mode - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Parameters + ---------- + x : Tensor + Shape (N, C, H, W) + + Returns + ------- + Tensor + Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + The stretched image. + """ out = F.interpolate( - x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) + x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode) return out class UpsampleNet(nn.Layer): + """A Layer to upsample spectrogram by applying consecutive stretch and + convolutions. + + Parameters + ---------- + upsample_scales : List[int] + Upsampling factors for each strech. + nonlinear_activation : Optional[str], optional + Activation after each convolution, by default None + nonlinear_activation_params : Dict[str, Any], optional + Parameters passed to construct the activation, by default {} + interpolate_mode : str, optional + Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size : int, optional + Convolution kernel size along the frequency axis, by default 1 + use_causal_conv : bool, optional + Whether to use causal padding before convolution, by default False + + If True, Causal padding is used along the time axis, i.e. padding + amount is ``receptive field - 1`` and 0 for before and after, + respectively. + + If False, "same" padding is used along the time axis. + """ + def __init__(self, - upsample_scales, - nonlinear_activation=None, - nonlinear_activation_params={}, - interpolate_mode="nearest", - freq_axis_kernel_size=1, - use_causal_conv=False): + upsample_scales: List[int], + nonlinear_activation: Optional[str]=None, + nonlinear_activation_params: Dict[str, Any]={}, + interpolate_mode: str="nearest", + freq_axis_kernel_size: int=1, + use_causal_conv: bool=False): super().__init__() self.use_causal_conv = use_causal_conv self.up_layers = nn.LayerList() @@ -59,7 +114,19 @@ class UpsampleNet(nn.Layer): nn, nonlinear_activation)(**nonlinear_activation_params) self.up_layers.extend([stretch, conv, nonlinear]) - def forward(self, c): + def forward(self, c: Tensor) -> Tensor: + """ + Parameters + ---------- + c : Tensor + Shape (N, F, T), spectrogram + + Returns + ------- + Tensor + Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled + spectrogram + """ c = c.unsqueeze(1) for f in self.up_layers: if self.use_causal_conv and isinstance(f, nn.Conv2D): @@ -70,15 +137,48 @@ class UpsampleNet(nn.Layer): class ConvInUpsampleNet(nn.Layer): + """A Layer to upsample spectrogram composed of a convolution and an + UpsampleNet. + + Parameters + ---------- + upsample_scales : List[int] + Upsampling factors for each strech. + nonlinear_activation : Optional[str], optional + Activation after each convolution, by default None + nonlinear_activation_params : Dict[str, Any], optional + Parameters passed to construct the activation, by default {} + interpolate_mode : str, optional + Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size : int, optional + Convolution kernel size along the frequency axis, by default 1 + aux_channels : int, optional + Feature size of the input, by default 80 + aux_context_window : int, optional + Context window of the first 1D convolution applied to the input. It + related to the kernel size of the convolution, by default 0 + + If use causal convolution, the kernel size is ``window + 1``, else + the kernel size is ``2 * window + 1``. + use_causal_conv : bool, optional + Whether to use causal padding before convolution, by default False + + If True, Causal padding is used along the time axis, i.e. padding + amount is ``receptive field - 1`` and 0 for before and after, + respectively. + + If False, "same" padding is used along the time axis. + """ + def __init__(self, - upsample_scales, - nonlinear_activation=None, - nonlinear_activation_params={}, - interpolate_mode="nearest", - freq_axis_kernel_size=1, - aux_channels=80, - aux_context_window=0, - use_causal_conv=False): + upsample_scales: List[int], + nonlinear_activation: Optional[str]=None, + nonlinear_activation_params: Dict[str, Any]={}, + interpolate_mode: str="nearest", + freq_axis_kernel_size: int=1, + aux_channels: int=80, + aux_context_window: int=0, + use_causal_conv: bool=False): super().__init__() self.aux_context_window = aux_context_window self.use_causal_conv = use_causal_conv and aux_context_window > 0 @@ -96,23 +196,61 @@ class ConvInUpsampleNet(nn.Layer): freq_axis_kernel_size=freq_axis_kernel_size, use_causal_conv=use_causal_conv) - def forward(self, c): + def forward(self, c: Tensor) -> Tensor: + """ + Parameters + ---------- + c : Tensor + Shape (N, F, T), spectrogram + + Returns + ------- + Tensors + Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled + spectrogram + """ c_ = self.conv_in(c) c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ return self.upsample(c) class ResidualBlock(nn.Layer): + """A gated activation unit composed of an 1D convolution, a gated tanh + unit and parametric redidual and skip connections. For more details, + refer to `WaveNet: A Generative Model for Raw Audio `_. + + Parameters + ---------- + kernel_size : int, optional + Kernel size of the 1D convolution, by default 3 + residual_channels : int, optional + Feature size of the resiaudl output(and also the input), by default 64 + gate_channels : int, optional + Output feature size of the 1D convolution, by default 128 + skip_channels : int, optional + Feature size of the skip output, by default 64 + aux_channels : int, optional + Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout : float, optional + Probability of the dropout before the 1D convolution, by default 0. + dilation : int, optional + Dilation of the 1D convolution, by default 1 + bias : bool, optional + Whether to use bias in the 1D convolution, by default True + use_causal_conv : bool, optional + Whether to use causal padding for the 1D convolution, by default False + """ + def __init__(self, - kernel_size=3, - residual_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=80, - dropout=0., - dilation=1, - bias=True, - use_causal_conv=False): + kernel_size: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + aux_channels: int=80, + dropout: float=0., + dilation: int=1, + bias: bool=True, + use_causal_conv: bool=False): super().__init__() self.dropout = dropout if use_causal_conv: @@ -144,7 +282,24 @@ class ResidualBlock(nn.Layer): self.conv1x1_skip = nn.Conv1D( gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias) - def forward(self, x, c): + def forward(self, x: Tensor, c: Tensor) -> Tuple[Tensor, Tensor]: + """ + Parameters + ---------- + x : Tensor + Shape (N, C_res, T), the input features. + c : Tensor + Shape (N, C_aux, T), he auxiliary input. + + Returns + ------- + res : Tensor + Shape (N, C_res, T), the residual output, which is used as the + input of the next ResidualBlock in a stack of ResidualBlocks. + skip : Tensor + Shape (N, C_skip, T), the skip output, which is collected among + each layer in a stack of ResidualBlocks. + """ x_input = x x = F.dropout(x, self.dropout, training=self.training) x = self.conv(x) @@ -162,26 +317,76 @@ class ResidualBlock(nn.Layer): class PWGGenerator(nn.Layer): + """Wave Generator for Parallel WaveGAN + + Parameters + ---------- + in_channels : int, optional + Number of channels of the input waveform, by default 1 + out_channels : int, optional + Number of channels of the output waveform, by default 1 + kernel_size : int, optional + Kernel size of the residual blocks inside, by default 3 + layers : int, optional + Number of residual blocks inside, by default 30 + stacks : int, optional + The number of groups to split the residual blocks into, by default 3 + + Within each group, the dilation of the residual block grows + exponentially. + residual_channels : int, optional + Residual channel of the residual blocks, by default 64 + gate_channels : int, optional + Gate channel of the residual blocks, by default 128 + skip_channels : int, optional + Skip channel of the residual blocks, by default 64 + aux_channels : int, optional + Auxiliary channel of the residual blocks, by default 80 + aux_context_window : int, optional + The context window size of the first convolution applied to the + auxiliary input, by default 2 + dropout : float, optional + Dropout of the residual blocks, by default 0. + bias : bool, optional + Whether to use bias in residual blocks, by default True + use_weight_norm : bool, optional + Whether to use weight norm in all convolutions, by default True + use_causal_conv : bool, optional + Whether to use causal padding in the upsample network and residual + blocks, by default False + upsample_scales : List[int], optional + Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation : Optional[str], optional + Non linear activation in upsample network, by default None + nonlinear_activation_params : Dict[str, Any], optional + Parameters passed to the linear activation in the upsample network, + by default {} + interpolate_mode : str, optional + Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size : int, optional + Kernel size along the frequency axis of the upsample network, by default 1 + """ + def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - layers=30, - stacks=3, - residual_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=80, - aux_context_window=2, - dropout=0., - bias=True, - use_weight_norm=True, - use_causal_conv=False, - upsample_scales=[4, 4, 4, 4], - nonlinear_activation=None, - nonlinear_activation_params={}, - interpolate_mode="nearest", - freq_axis_kernel_size=1): + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=30, + stacks: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + aux_channels: int=80, + aux_context_window: int=2, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=True, + use_causal_conv: bool=False, + upsample_scales: List[int]=[4, 4, 4, 4], + nonlinear_activation: Optional[str]=None, + nonlinear_activation_params: Dict[str, Any]={}, + interpolate_mode: str="nearest", + freq_axis_kernel_size: int=1): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -233,10 +438,24 @@ class PWGGenerator(nn.Layer): if use_weight_norm: self.apply_weight_norm() - def forward(self, x, c): - if c is not None: - c = self.upsample_net(c) - assert c.shape[-1] == x.shape[-1] + def forward(self, x: Tensor, c: Tensor) -> Tensor: + """Generate waveform. + + Parameters + ---------- + x : Tensor + Shape (N, C_in, T), The input waveform. + c : Tensor + Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It + is upsampled to match the time resolution of the input. + + Returns + ------- + Tensor + Shape (N, C_out, T), the generated waveform. + """ + c = self.upsample_net(c) + assert c.shape[-1] == x.shape[-1] x = self.first_conv(x) skips = 0 @@ -249,6 +468,10 @@ class PWGGenerator(nn.Layer): return x def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + def _apply_weight_norm(layer): if isinstance(layer, (nn.Conv1D, nn.Conv2D)): nn.utils.weight_norm(layer) @@ -256,6 +479,10 @@ class PWGGenerator(nn.Layer): self.apply(_apply_weight_norm) def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + def _remove_weight_norm(layer): try: nn.utils.remove_weight_norm(layer) @@ -264,17 +491,30 @@ class PWGGenerator(nn.Layer): self.apply(_remove_weight_norm) - def inference(self, c=None, x=None): - """ - single instance inference - c: [T', C] condition - x: [T, 1] noise + def inference(self, c: Optional[Tensor]=None, + x: Optional[Tensor]=None) -> Tensor: + """Waveform generation. This function is used for single instance + inference. + + Parameters + ---------- + c : Tensor, optional + Shape (T', C_aux), the auxiliary input, by default None + x : Tensor, optional + Shape (T, C_in), the noise waveform, by default None + If not provided, a sample is drawn from a gaussian distribution. + + Returns + ------- + Tensor + Shape (T, C_out), the generated waveform """ if x is not None: x = paddle.transpose(x, [1, 0]).unsqueeze(0) # pseudo batch else: assert c is not None - x = paddle.randn([1, 1, c.shape[0] * self.upsample_factor]) + x = paddle.randn( + [1, self.in_channels, c.shape[0] * self.upsample_factor]) if c is not None: c = paddle.transpose(c, [1, 0]).unsqueeze(0) # pseudo batch From 0067851950281a48ebebda8e24a695b6200e3f08 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 11 Jun 2021 05:26:22 +0800 Subject: [PATCH 17/47] fix imports --- parakeet/models/parallel_wavegan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index aded823..00a48a4 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -13,7 +13,7 @@ # limitations under the License. import math -from typing import List, Dict, Any, Union, Optional +from typing import List, Dict, Any, Union, Optional, Tuple import numpy as np import paddle From 54c7905f403bb2cbeb520b3228d3529d6d3339b3 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Sun, 13 Jun 2021 17:00:44 +0800 Subject: [PATCH 18/47] WIP: training setup done --- .gitignore | 2 + examples/parallelwave_gan/baker/batch_fn.py | 105 +++++++ .../parallelwave_gan/baker/conf/default.yaml | 127 ++++++++ examples/parallelwave_gan/baker/config.py | 25 ++ examples/parallelwave_gan/baker/preprocess.py | 280 ++++++++++++++++++ examples/parallelwave_gan/baker/train.py | 166 +++++++++++ parakeet/models/parallel_wavegan.py | 3 +- 7 files changed, 707 insertions(+), 1 deletion(-) create mode 100644 examples/parallelwave_gan/baker/batch_fn.py create mode 100644 examples/parallelwave_gan/baker/conf/default.yaml create mode 100644 examples/parallelwave_gan/baker/config.py create mode 100644 examples/parallelwave_gan/baker/preprocess.py create mode 100644 examples/parallelwave_gan/baker/train.py diff --git a/.gitignore b/.gitignore index 7906666..af9563d 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,5 @@ dmypy.json *.swp runs syn_audios +exp/ +dump/ diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py new file mode 100644 index 0000000..71761e9 --- /dev/null +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + + +class Clip(object): + """Collate functor for training vocoders. + """ + + def __init__( + self, + batch_max_steps=20480, + hop_size=256, + aux_context_window=0, ): + """Initialize customized collater for PyTorch DataLoader. + + Args: + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. + + """ + if batch_max_steps % hop_size != 0: + batch_max_steps += -(batch_max_steps % hop_size) + assert batch_max_steps % hop_size == 0 + self.batch_max_steps = batch_max_steps + self.batch_max_frames = batch_max_steps // hop_size + self.hop_size = hop_size + self.aux_context_window = aux_context_window + + # set useful values in random cutting + self.start_offset = aux_context_window + self.end_offset = -(self.batch_max_frames + aux_context_window) + self.mel_threshold = self.batch_max_frames + 2 * aux_context_window + + def __call__(self, examples): + """Convert into batch tensors. + + Args: + batch (list): list of tuple of the pair of audio and features. + + Returns: + Tensor: Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: Target signal batch (B, 1, T). + + """ + # check length + examples = [ + self._adjust_length(*b) for b in examples + if len(b[1]) > self.mel_threshold + ] + xs, cs = [b[0] for b in examples], [b[1] for b in examples] + + # make batch with random cut + c_lengths = [len(c) for c in cs] + start_frames = np.array([ + np.random.randint(self.start_offset, cl + self.end_offset) + for cl in c_lengths + ]) + x_starts = start_frames * self.hop_size + x_ends = x_starts + self.batch_max_steps + + c_starts = start_frames - self.aux_context_window + c_ends = start_frames + self.batch_max_frames + self.aux_context_window + y_batch = [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)] + c_batch = [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)] + + # convert each batch to tensor, asuume that each item in batch has the same length + y_batch = paddle.to_tensor( + y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) + c_batch = paddle.to_tensor( + c_batch, dtype=paddle.float32).transpose([0, 2, 1]) # (B, C, T') + + return (c_batch, ), y_batch + + def _adjust_length(self, x, c): + """Adjust the audio and feature lengths. + + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. + + """ + if len(x) < len(c) * self.hop_size: + x = np.pad(x, (0, len(c) * self.hop_size - len(x)), mode="edge") + + # check the legnth is valid + assert len(x) == len(c) * self.hop_size + + return x, c diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml new file mode 100644 index 0000000..ee888a2 --- /dev/null +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -0,0 +1,127 @@ +# This is the hyperparameter configuration file for Parallel WaveGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +sr: 24000 # Sampling rate. +n_fft: 2048 # FFT size. +hop_length: 300 # Hop size. +win_length: 1200 # Window length. + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. +fmax: 7600 # Maximum frequency in mel basis calculation. +# global_gain_scale: 1.0 # Will be multiplied to all of waveform. +trim_silence: false # Whether to trim the start and end of silence. +top_db: 60 # Need to tune carefully if the recording is not good. +trim_frame_length: 2048 # Frame size in trimming. +trim_hop_length: 512 # Hop size in trimming. +# format: "npy" # Feature file format. "npy" or "hdf5" is supported. + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Kernel size of dilated convolution. + layers: 30 # Number of residual block layers. + stacks: 3 # Number of stacks i.e., dilation cycles. + residual_channels: 64 # Number of channels in residual conv. + gate_channels: 128 # Number of channels in gated conv. + skip_channels: 64 # Number of channels in skip conv. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. + bias: true # use bias in residual blocks + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + use_causal_conv: false # use causal conv in residual blocks and upsample layers + # upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. + interpolate_mode: "nearest" # upsample net interpolate mode + freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis + nonlinear_activation: null + nonlinear_activation_params: {} + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Number of output channels. + layers: 10 # Number of conv layers. + conv_channels: 64 # Number of chnn layers. + bias: true # Whether to use bias parameter in conv. + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. + nonlinear_activation_params: # Nonlinear function parameters + negative_slope: 0.2 # Alpha in LeakyReLU. + +########################################################### +# STFT LOSS SETTING # +########################################################### +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_adv: 4.0 # Loss balancing coefficient. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 6 # Batch size. +batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. +pin_memory: true # Whether to pin memory in Pytorch DataLoader. +num_workers: 2 # Number of workers in Pytorch DataLoader. +remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. +allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. +log_interval_steps: 100 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. diff --git a/examples/parallelwave_gan/baker/config.py b/examples/parallelwave_gan/baker/config.py new file mode 100644 index 0000000..f555791 --- /dev/null +++ b/examples/parallelwave_gan/baker/config.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import yaml +from yacs.config import CfgNode as Configuration + +with open("conf/default.yaml", 'rt') as f: + _C = yaml.safe_load(f) + _C = Configuration(_C) + + +def get_cfg_default(): + config = _C.clone() + return config diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py new file mode 100644 index 0000000..9bf9623 --- /dev/null +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -0,0 +1,280 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Dict, Any +import soundfile as sf +import librosa +import numpy as np +from config import get_cfg_default +import argparse +import yaml +import json +import dacite +import dataclasses +import concurrent.futures +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from pathlib import Path +import tqdm +from operator import itemgetter +from praatio import tgio +import logging + + +def logmelfilterbank(audio, + sr, + n_fft=1024, + hop_length=256, + win_length=None, + window="hann", + n_mels=80, + fmin=None, + fmax=None, + eps=1e-10): + """Compute log-Mel filterbank feature. + + Parameters + ---------- + audio : ndarray + Audio signal (T,). + sr : int + Sampling rate. + n_fft : int + FFT size. (Default value = 1024) + hop_length : int + Hop size. (Default value = 256) + win_length : int + Window length. If set to None, it will be the same as fft_size. (Default value = None) + window : str + Window function type. (Default value = "hann") + n_mels : int + Number of mel basis. (Default value = 80) + fmin : int + Minimum frequency in mel basis calculation. (Default value = None) + fmax : int + Maximum frequency in mel basis calculation. (Default value = None) + eps : float + Epsilon value to avoid inf in log calculation. (Default value = 1e-10) + + Returns + ------- + np.ndarray + Log Mel filterbank feature (#frames, num_mels). + + """ + # get amplitude spectrogram + x_stft = librosa.stft( + audio, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + pad_mode="reflect") + spc = np.abs(x_stft) # (#bins, #frames,) + + # get mel basis + fmin = 0 if fmin is None else fmin + fmax = sr / 2 if fmax is None else fmax + mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) + + return np.log10(np.maximum(eps, np.dot(mel_basis, spc))) + + +def process_sentence(config: Dict[str, Any], + fp: Path, + alignment_fp: Path, + output_dir: Path): + utt_id = fp.stem + + # reading + y, sr = librosa.load(fp, sr=config.sr) # resampling may occur + assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(y).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." + duration = librosa.get_duration(y, sr=sr) + + # trim according to the alignment file + alignment = tgio.openTextgrid(alignment_fp) + intervals = alignment.tierDict[alignment.tierNameList[0]].entryList + first, last = intervals[0], intervals[-1] + start = 0 + end = last.end + if first.label == "sil" and first.end < duration: + start = first.end + else: + logging.warning( + f" There is something wrong with the fisrt interval {first} in utterance: {utt_id}" + ) + if last.label == "sil" and last.start < duration: + end = last.start + else: + end = duration + logging.warning( + f" There is something wrong with the last interval {last} in utterance: {utt_id}" + ) + # silence trimmed + start, end = librosa.time_to_samples([first.end, last.start], sr=sr) + y = y[start:end] + + # energy based silence trimming + if config.trim_silence: + y, _ = librosa.effects.trim( + y, + top_db=config.top_db, + frame_length=config.trim_frame_length, + hop_length=config.trim_hop_length) + + logmel = logmelfilterbank( + y, + sr=sr, + n_fft=config.n_fft, + window=config.window, + win_length=config.win_length, + hop_length=config.hop_length, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # adjust time to make num_samples == num_frames * hop_length + num_frames = logmel.shape[1] + y = np.pad(y, (0, config.n_fft), mode="reflect") + y = y[:num_frames * config.hop_length] + num_sample = y.shape[0] + + mel_path = output_dir / (utt_id + "_feats.npy") + wav_path = output_dir / (utt_id + "_wave.npy") + np.save(wav_path, y) + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "num_samples": num_sample, + "num_frames": num_frames, + "feats_path": str(mel_path.resolve()), + "wave_path": str(wav_path.resolve()), + } + return record + + +def process_sentences(config, + fps: List[Path], + alignment_fps: List[Path], + output_dir: Path, + nprocs: int=1): + if nprocs == 1: + results = [] + for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)): + results.append( + process_sentence(config, fp, alignment_fp, output_dir)) + else: + with ProcessPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp, alignment_fp in zip(fps, alignment_fps): + future = pool.submit(process_sentence, config, fp, + alignment_fp, output_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + results.append(ft.result()) + + results.sort(key=itemgetter("utt_id")) + with open(output_dir / "metadata.json", 'wt') as f: + json.dump(results, f) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)." + ) + parser.add_argument( + "--rootdir", + default=None, + type=str, + help="directory including wav files. you need to specify either scp or rootdir." + ) + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--config", type=str, help="yaml format configuration file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + parser.add_argument( + "--num_cpu", type=int, default=1, help="number of process.") + args = parser.parse_args() + + C = get_cfg_default() + if args.config: + C.merge_from_file(args.config) + C.freeze() + + if args.verbose > 1: + print(vars(args)) + print(yaml.dump(dataclasses.asdict(C))) + + root_dir = Path(args.rootdir) + dumpdir = Path(args.dumpdir) + dumpdir.mkdir(parents=True, exist_ok=True) + + wav_files = sorted(list((root_dir / "Wave").rglob("*.wav"))) + alignment_files = sorted( + list((root_dir / "PhoneLabeling").rglob("*.interval"))) + + # split data into 3 sections + train_wav_files = wav_files[:9800] + dev_wav_files = wav_files[9800:9900] + test_wav_files = wav_files[9900:] + + train_alignment_files = alignment_files[:9800] + dev_alignment_files = alignment_files[9800:9900] + test_alignment_files = alignment_files[9900:] + + train_dump_dir = dumpdir / "train" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # process for the 3 sections + process_sentences( + C, + train_wav_files, + train_alignment_files, + train_dump_dir, + nprocs=args.num_cpu) + process_sentences( + C, + dev_wav_files, + dev_alignment_files, + dev_dump_dir, + nprocs=args.num_cpu) + process_sentences( + C, + test_wav_files, + test_alignment_files, + test_dump_dir, + nprocs=args.num_cpu) + + +if __name__ == "__main__": + main() diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py new file mode 100644 index 0000000..bed3e21 --- /dev/null +++ b/examples/parallelwave_gan/baker/train.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import logging +import argparse +import dataclasses +from pathlib import Path + +import yaml +import dacite +import json +import paddle +import numpy as np +from paddle import nn +from paddle.nn import functional as F +from paddle import distributed as dist +from paddle.io import DataLoader, DistributedBatchSampler +from paddle.optimizer import Adam # No RAdaom +from paddle.optimizer.lr import StepDecay +from paddle import DataParallel +from visualdl import LogWriter + +from parakeet.datasets.data_table import DataTable +from parakeet.training.updater import UpdaterBase +from parakeet.training.trainer import Trainer +from parakeet.training.reporter import report +from parakeet.training.checkpoint import KBest, KLatest +from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator +from parakeet.modules.stft_loss import MultiResolutionSTFTLoss + +from config import get_cfg_default + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if not paddle.is_compiled_with_cuda: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # construct dataset for training and validation + with open(args.train_metadata) as f: + train_metadata = json.load(f) + train_dataset = DataTable( + data=train_metadata, + fields=["wave_path", "feats_path"], + converters={ + "wave_path": np.load, + "feats_path": np.load, + }, ) + with open(args.dev_metadata) as f: + dev_metadata = json.load(f) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave_path", "feats_path"], + converters={ + "wave_path": np.load, + "feats_path": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=None, # TODO(defaine collate fn) + num_workers=4) + dev_dataloader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=None, # TODO(defaine collate fn) + num_workers=4) + + generator = PWGGenerator(**config["generator_params"]) + discriminator = PWGDiscriminator(**config["discriminator_params"]) + if world_size > 1: + generator = DataParallel(generator) + discriminator = DataParallel(discriminator) + criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"]) + criterion_mse = nn.MSELoss() + lr_schedule_g = StepDecay(**config["generator_scheduler_params"]) + optimizer_g = Adam( + lr_schedule_g, + parameters=generator.parameters(), + **config["generator_optimizer_params"]) + lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) + optimizer_d = Adam( + lr_schedule_d, + parameters=discriminator.parameters(), + **config["discriminator_optimizer_params"]) + + output_dir = Path(args.output_dir) + log_writer = None + if dist.get_rank() == 0: + output_dir.mkdir(parents=True, exist_ok=True) + log_writer = LogWriter(output_dir) + + # training loop + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN " + "model with Baker Mandrin TTS dataset.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config") + parser.add_argument("--train-metadata", type=str, help="training data") + parser.add_argument("--dev-metadata", type=str, help="dev data") + parser.add_argument("--output-dir", type=str, help="output dir") + parser.add_argument( + "--nprocs", type=int, default=1, help="number of processes") + parser.add_argument("--verbose", type=int, default=1, help="verbose") + + args = parser.parse_args() + config = get_cfg_default() + if args.config: + config.merge_from_file(args.config) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.nprocs > 1: + dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index 00a48a4..e7d202d 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -109,10 +109,11 @@ class UpsampleNet(nn.Layer): padding = (freq_axis_padding, scale) conv = nn.Conv2D( 1, 1, kernel_size, padding=padding, bias_attr=False) + self.up_layers.extend([stretch, conv]) if nonlinear_activation is not None: nonlinear = getattr( nn, nonlinear_activation)(**nonlinear_activation_params) - self.up_layers.extend([stretch, conv, nonlinear]) + self.up_layers.append(nonlinear) def forward(self, c: Tensor) -> Tensor: """ From b0983e4d76a232943e3313dbab5f02822defb821 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 14 Jun 2021 17:05:37 +0800 Subject: [PATCH 19/47] WIP: pwg training works --- examples/parallelwave_gan/baker/batch_fn.py | 25 ++-- .../parallelwave_gan/baker/conf/default.yaml | 2 +- .../parallelwave_gan/baker/pwg_updater.py | 111 ++++++++++++++++++ examples/parallelwave_gan/baker/train.py | 53 ++++++++- parakeet/training/trainer.py | 7 +- parakeet/training/trigger.py | 4 +- parakeet/training/updater.py | 3 +- 7 files changed, 183 insertions(+), 22 deletions(-) create mode 100644 examples/parallelwave_gan/baker/pwg_updater.py diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py index 71761e9..be22dbb 100644 --- a/examples/parallelwave_gan/baker/batch_fn.py +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -60,13 +60,13 @@ class Clip(object): """ # check length examples = [ - self._adjust_length(*b) for b in examples - if len(b[1]) > self.mel_threshold + self._adjust_length(b['wave_path'], b['feats_path']) + for b in examples if b['feats_path'].shape[1] > self.mel_threshold ] xs, cs = [b[0] for b in examples], [b[1] for b in examples] # make batch with random cut - c_lengths = [len(c) for c in cs] + c_lengths = [c.shape[1] for c in cs] start_frames = np.array([ np.random.randint(self.start_offset, cl + self.end_offset) for cl in c_lengths @@ -76,16 +76,17 @@ class Clip(object): c_starts = start_frames - self.aux_context_window c_ends = start_frames + self.batch_max_frames + self.aux_context_window - y_batch = [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)] - c_batch = [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)] + y_batch = np.stack( + [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)]) + c_batch = np.stack( + [c[:, start:end] for c, start, end in zip(cs, c_starts, c_ends)]) # convert each batch to tensor, asuume that each item in batch has the same length y_batch = paddle.to_tensor( y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) - c_batch = paddle.to_tensor( - c_batch, dtype=paddle.float32).transpose([0, 2, 1]) # (B, C, T') + c_batch = paddle.to_tensor(c_batch, dtype=paddle.float32) # (B, C, T') - return (c_batch, ), y_batch + return y_batch, c_batch def _adjust_length(self, x, c): """Adjust the audio and feature lengths. @@ -96,10 +97,12 @@ class Clip(object): features, this process will be needed. """ - if len(x) < len(c) * self.hop_size: - x = np.pad(x, (0, len(c) * self.hop_size - len(x)), mode="edge") + if len(x) < c.shape[1] * self.hop_size: + x = np.pad(x, (0, c.shape[1] * self.hop_size - len(x)), + mode="edge") # check the legnth is valid - assert len(x) == len(c) * self.hop_size + assert len(x) == c.shape[ + 1] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[1]})" return x, c diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index ee888a2..1cb1487 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -86,7 +86,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. batch_size: 6 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 0 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py new file mode 100644 index 0000000..a7a7476 --- /dev/null +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from parakeet.datasets.data_table import DataTable +from parakeet.training.updater import UpdaterBase, UpdaterState +from parakeet.training.trainer import Trainer +from parakeet.training.reporter import report +from parakeet.training.checkpoint import KBest, KLatest +from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator +from parakeet.modules.stft_loss import MultiResolutionSTFTLoss + + +class PWGUpdater(UpdaterBase): + def __init__( + self, + models, + optimizers, + criterions, + schedulers, + dataloaders, + discriminator_train_start_steps: int, + lambda_adv: float, ): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.optimizers = optimizers + self.optimizer_g = optimizers['generator'] + self.optimizer_d = optimizers['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_mse = criterions['mse'] + + self.schedulers = schedulers + self.scheduler_g = schedulers['generator'] + self.scheduler_d = schedulers['discriminator'] + + self.dataloaders = dataloaders + self.train_dataloader = dataloaders['train'] + self.dev_dataloader = dataloaders['dev'] + + self.discriminator_train_start_steps = discriminator_train_start_steps + self.lambda_adv = lambda_adv + self.state = UpdaterState(iteration=0, epoch=0) + + self.train_iterator = iter(self.train_dataloader) + + def update_core(self): + try: + batch = next(self.train_iterator) + except StopIteration: + self.train_iterator = iter(self.train_dataloader) + batch = next(self.train_iterator) + + wav, mel = batch + + # Generator + noise = paddle.randn(wav.shape) + wav_ = self.generator(noise, mel) + + ## Multi-resolution stft loss + sc_loss, mag_loss = self.criterion_stft( + wav_.squeeze(1), wav.squeeze(1)) + report("train/spectral_convergence_loss", float(sc_loss)) + report("train/log_stft_magnitude_loss", float(mag_loss)) + gen_loss = sc_loss + mag_loss + + ## Adversarial loss + if self.state.iteration > self.discriminator_train_start_steps: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + report("train/adversarial_loss", float(adv_loss)) + gen_loss += self.lambda_adv * adv_loss + + report("train/generator_loss", float(gen_loss)) + self.optimizer_g.clear_grad() + gen_loss.backward() + self.optimizer_g.step() + self.scheduler_g.step() + + # Disctiminator + if self.state.iteration > self.discriminator_train_start_steps: + with paddle.no_grad(): + wav_ = self.generator(noise, mel) + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss = self.criterion_mse(p, paddle.ones_like(p)) + fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + report("train/real_loss", float(real_loss)) + report("train/fake_loss", float(fake_loss)) + dis_loss = real_loss + fake_loss + report("train/discriminator_loss", float(dis_loss)) + + self.optimizer_d.clear_grad() + dis_loss.backward() + self.optimizer_d.step() + self.scheduler_d.step() diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index bed3e21..42332f1 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -41,7 +41,9 @@ from parakeet.training.checkpoint import KBest, KLatest from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss +from batch_fn import Clip from config import get_cfg_default +from pwg_updater import PWGUpdater def train_sp(args, config): @@ -90,25 +92,35 @@ def train_sp(args, config): batch_size=config.batch_size, shuffle=False, drop_last=False) + print("samplers done!") + train_batch_fn = Clip( + batch_max_steps=config.batch_max_steps, + hop_size=config.hop_length, + aux_context_window=config.generator_params.aux_context_window) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=None, # TODO(defaine collate fn) - num_workers=4) + collate_fn=train_batch_fn, # TODO(defaine collate fn) + num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, batch_sampler=dev_sampler, - collate_fn=None, # TODO(defaine collate fn) - num_workers=4) + collate_fn=train_batch_fn, # TODO(defaine collate fn) + num_workers=config.num_workers) + print("dataloaders done!") generator = PWGGenerator(**config["generator_params"]) discriminator = PWGDiscriminator(**config["discriminator_params"]) if world_size > 1: generator = DataParallel(generator) discriminator = DataParallel(discriminator) + print("models done!") + criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"]) criterion_mse = nn.MSELoss() + print("criterions done!") + lr_schedule_g = StepDecay(**config["generator_scheduler_params"]) optimizer_g = Adam( lr_schedule_g, @@ -119,14 +131,43 @@ def train_sp(args, config): lr_schedule_d, parameters=discriminator.parameters(), **config["discriminator_optimizer_params"]) + print("optimizers done!") output_dir = Path(args.output_dir) log_writer = None if dist.get_rank() == 0: output_dir.mkdir(parents=True, exist_ok=True) - log_writer = LogWriter(output_dir) + log_writer = LogWriter(str(output_dir)) - # training loop + updater = PWGUpdater( + models={ + "generator": generator, + "discriminator": discriminator, + }, + optimizers={ + "generator": optimizer_g, + "discriminator": optimizer_d, + }, + criterions={ + "stft": criterion_stft, + "mse": criterion_mse, + }, + schedulers={ + "generator": lr_schedule_g, + "discriminator": lr_schedule_d, + }, + dataloaders={ + "train": train_dataloader, + "dev": dev_dataloader, + }, + discriminator_train_start_steps=config.discriminator_train_start_steps, + lambda_adv=config.lambda_adv, ) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir, ) + trainer.run() def main(): diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index 544ea8e..548df64 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -76,7 +76,7 @@ class Trainer(object): else: max_iteration = self.stop_trigger.period - while not stop_trigger(self): + while True: self.observation = {} # set observation as the report target # you can use report freely in Updater.update() @@ -84,8 +84,13 @@ class Trainer(object): # updating parameters and state with scope(self.observation): update() + print(self.observation) # execute extension when necessary for name, entry in extensions: if entry.trigger(self): entry.extension(self) + + if stop_trigger(self): + print("Training Done!") + break diff --git a/parakeet/training/trigger.py b/parakeet/training/trigger.py index a7d4ef9..5d165a6 100644 --- a/parakeet/training/trigger.py +++ b/parakeet/training/trigger.py @@ -23,9 +23,9 @@ class IntervalTrigger(object): def __call__(self, trainer): state = trainer.updater.state if self.unit == "epoch": - fire = not (state.epoch % self.period) + fire = state.epoch % self.period == 0 else: - fire = not (state.iteration % self.iteration) + fire = state.iteration % self.period == 0 return fire diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index 8359eef..c8383f2 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -57,7 +57,8 @@ class UpdaterBase(object): """ def update(self): - pass + self.state.iteration += 1 + self.update_core() def update_core(self): pass From 95f64c4f02cf05516f729cb229745e5c38a931a0 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 14 Jun 2021 17:21:45 +0800 Subject: [PATCH 20/47] WIP: add some trainig info --- examples/parallelwave_gan/baker/conf/default.yaml | 2 +- parakeet/training/trainer.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index 1cb1487..29d6c0f 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -86,7 +86,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. batch_size: 6 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 0 # Number of workers in Pytorch DataLoader. +num_workers: 4 # Number of workers in Pytorch DataLoader. remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index 548df64..ad3661f 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -76,6 +76,8 @@ class Trainer(object): else: max_iteration = self.stop_trigger.period + p = tqdm.tqdm() + while True: self.observation = {} # set observation as the report target @@ -84,12 +86,13 @@ class Trainer(object): # updating parameters and state with scope(self.observation): update() - print(self.observation) + p.update() + print(self.observation) - # execute extension when necessary - for name, entry in extensions: - if entry.trigger(self): - entry.extension(self) + # execute extension when necessary + for name, entry in extensions: + if entry.trigger(self): + entry.extension(self) if stop_trigger(self): print("Training Done!") From bbbe5a8b50e71390341c4d0e447fa09abab33608 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 14:47:30 +0800 Subject: [PATCH 21/47] add profiling --- .../parallelwave_gan/baker/pwg_updater.py | 46 +++++++++++++------ examples/parallelwave_gan/baker/train.py | 15 ++++-- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index a7a7476..70507f2 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +from timer import timer from parakeet.datasets.data_table import DataTable from parakeet.training.updater import UpdaterBase, UpdaterState @@ -60,39 +61,54 @@ class PWGUpdater(UpdaterBase): self.train_iterator = iter(self.train_dataloader) def update_core(self): - try: - batch = next(self.train_iterator) - except StopIteration: - self.train_iterator = iter(self.train_dataloader) - batch = next(self.train_iterator) + with timer() as t: + try: + batch = next(self.train_iterator) + except StopIteration: + self.train_iterator = iter(self.train_dataloader) + batch = next(self.train_iterator) + print(f"Loading a batch takes {t.elapse}s") wav, mel = batch # Generator noise = paddle.randn(wav.shape) - wav_ = self.generator(noise, mel) + + with timer() as t: + wav_ = self.generator(noise, mel) + print(f"Generator takes {t.elapse}s") ## Multi-resolution stft loss - sc_loss, mag_loss = self.criterion_stft( - wav_.squeeze(1), wav.squeeze(1)) + with timer() as t: + sc_loss, mag_loss = self.criterion_stft( + wav_.squeeze(1), wav.squeeze(1)) + print(f"Multi-resolution STFT loss takes {t.elapse}s") + report("train/spectral_convergence_loss", float(sc_loss)) report("train/log_stft_magnitude_loss", float(mag_loss)) gen_loss = sc_loss + mag_loss ## Adversarial loss if self.state.iteration > self.discriminator_train_start_steps: - p_ = self.discriminator(wav_) - adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + with timer() as t: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + print(f"Discriminator and adversarial loss takes {t.elapse}s") report("train/adversarial_loss", float(adv_loss)) gen_loss += self.lambda_adv * adv_loss report("train/generator_loss", float(gen_loss)) - self.optimizer_g.clear_grad() - gen_loss.backward() - self.optimizer_g.step() - self.scheduler_g.step() + with timer() as t: + self.optimizer_g.clear_grad() + gen_loss.backward() + print(f"Backward takes {t.elapse}s.") - # Disctiminator + with timer() as t: + self.optimizer_g.step() + self.scheduler_g.step() + print(f"Update takes {t.elapse}s.") + +# Disctiminator if self.state.iteration > self.discriminator_train_start_steps: with paddle.no_grad(): wav_ = self.generator(noise, mel) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 42332f1..2e1af14 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -122,13 +122,18 @@ def train_sp(args, config): print("criterions done!") lr_schedule_g = StepDecay(**config["generator_scheduler_params"]) + gradient_clip_g = nn.ClipGradByGlobalNorm(config["generator_grad_norm"]) optimizer_g = Adam( - lr_schedule_g, + learning_rate=lr_schedule_g, + grad_clip=gradient_clip_g, parameters=generator.parameters(), **config["generator_optimizer_params"]) lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) + gradient_clip_d = nn.ClipGradByGlobalNorm(config[ + "discriminator_grad_norm"]) optimizer_d = Adam( - lr_schedule_d, + learning_rate=lr_schedule_d, + grad_clip=gradient_clip_d, parameters=discriminator.parameters(), **config["discriminator_optimizer_params"]) print("optimizers done!") @@ -165,9 +170,11 @@ def train_sp(args, config): trainer = Trainer( updater, - stop_trigger=(config.train_max_steps, "iteration"), + stop_trigger=(10, "iteration"), # PROFILING out=output_dir, ) - trainer.run() + with paddle.fluid.profiler.cuda_profiler( + str(output_dir / "profiler.log")) as prof: + trainer.run() def main(): From 8dbcc9bccb75666407201392adad73189b67ffd9 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 09:40:47 +0000 Subject: [PATCH 22/47] add profiling --- examples/parallelwave_gan/baker/preprocess.py | 7 +++---- examples/parallelwave_gan/baker/pwg_updater.py | 16 +++++++++++++++- examples/parallelwave_gan/baker/train.py | 1 - parakeet/modules/stft_loss.py | 2 +- setup.py | 1 - 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index 9bf9623..23b5f05 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -16,12 +16,9 @@ from typing import List, Dict, Any import soundfile as sf import librosa import numpy as np -from config import get_cfg_default import argparse import yaml import json -import dacite -import dataclasses import concurrent.futures from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from pathlib import Path @@ -30,6 +27,8 @@ from operator import itemgetter from praatio import tgio import logging +from config import get_cfg_default + def logmelfilterbank(audio, sr, @@ -229,7 +228,7 @@ def main(): if args.verbose > 1: print(vars(args)) - print(yaml.dump(dataclasses.asdict(C))) + print(C) root_dir = Path(args.rootdir) dumpdir = Path(args.dumpdir) diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index 70507f2..3ab79b8 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +from paddle.fluid.core import _cuda_synchronize from timer import timer from parakeet.datasets.data_table import DataTable @@ -61,12 +62,15 @@ class PWGUpdater(UpdaterBase): self.train_iterator = iter(self.train_dataloader) def update_core(self): + place = paddle.fluid.framework._current_expected_place() with timer() as t: + _cuda_synchronize(place) try: batch = next(self.train_iterator) except StopIteration: self.train_iterator = iter(self.train_dataloader) batch = next(self.train_iterator) + _cuda_synchronize(place) print(f"Loading a batch takes {t.elapse}s") wav, mel = batch @@ -75,13 +79,17 @@ class PWGUpdater(UpdaterBase): noise = paddle.randn(wav.shape) with timer() as t: + _cuda_synchronize(place) wav_ = self.generator(noise, mel) + _cuda_synchronize(place) print(f"Generator takes {t.elapse}s") ## Multi-resolution stft loss with timer() as t: + _cuda_synchronize(place) sc_loss, mag_loss = self.criterion_stft( wav_.squeeze(1), wav.squeeze(1)) + _cuda_synchronize(place) print(f"Multi-resolution STFT loss takes {t.elapse}s") report("train/spectral_convergence_loss", float(sc_loss)) @@ -91,24 +99,30 @@ class PWGUpdater(UpdaterBase): ## Adversarial loss if self.state.iteration > self.discriminator_train_start_steps: with timer() as t: + _cuda_synchronize(place) p_ = self.discriminator(wav_) adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + _cuda_synchronize(place) print(f"Discriminator and adversarial loss takes {t.elapse}s") report("train/adversarial_loss", float(adv_loss)) gen_loss += self.lambda_adv * adv_loss report("train/generator_loss", float(gen_loss)) with timer() as t: + _cuda_synchronize(place) self.optimizer_g.clear_grad() gen_loss.backward() + _cuda_synchronize(place) print(f"Backward takes {t.elapse}s.") with timer() as t: + _cuda_synchronize(place) self.optimizer_g.step() self.scheduler_g.step() + _cuda_synchronize(place) print(f"Update takes {t.elapse}s.") -# Disctiminator + # Disctiminator if self.state.iteration > self.discriminator_train_start_steps: with paddle.no_grad(): wav_ = self.generator(noise, mel) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 2e1af14..830bc83 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -20,7 +20,6 @@ import dataclasses from pathlib import Path import yaml -import dacite import json import paddle import numpy as np diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index 6c5bc9f..6531010 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -64,7 +64,7 @@ class STFTLoss(nn.Layer): fft_size=1024, shift_size=120, win_length=600, - window="hann_window"): + window="hann"): """Initialize STFT loss module.""" super().__init__() self.fft_size = fft_size diff --git a/setup.py b/setup.py index eefc922..6f112cc 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,6 @@ setup_info = dict( 'scipy', 'pandas', 'sox', - # 'opencc', 'soundfile', 'g2p_en', 'yacs', From 042e02d242257461d4216c9c4bbcf4977d59bbb5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 13:43:13 +0000 Subject: [PATCH 23/47] use paddle's profiler --- examples/parallelwave_gan/baker/train.py | 5 +++-- parakeet/utils/h5_utils.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 830bc83..d424c5d 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -171,8 +171,9 @@ def train_sp(args, config): updater, stop_trigger=(10, "iteration"), # PROFILING out=output_dir, ) - with paddle.fluid.profiler.cuda_profiler( - str(output_dir / "profiler.log")) as prof: + with paddle.fluid.profiler.profiler('All', 'total', + str(output_dir / "profiler.log"), + 'Default') as prof: trainer.run() diff --git a/parakeet/utils/h5_utils.py b/parakeet/utils/h5_utils.py index 229fe8d..7cdbfc6 100644 --- a/parakeet/utils/h5_utils.py +++ b/parakeet/utils/h5_utils.py @@ -48,7 +48,8 @@ def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: f"There is no such a data in hdf5 file. ({dataset_name})") sys.exit(1) - hdf5_data = hdf5_file[dataset_name][()] # a special syntax of h5py + # [()]: a special syntax of h5py to get the dataset as-is + hdf5_data = hdf5_file[dataset_name][()] hdf5_file.close() return hdf5_data From 683cc1d30f3f8fbae93306c7e3207d93279f84a7 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 13:44:23 +0000 Subject: [PATCH 24/47] add unit testings back --- tests/test_pwg.py | 130 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_stft.py | 73 +++++++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 tests/test_pwg.py create mode 100644 tests/test_stft.py diff --git a/tests/test_pwg.py b/tests/test_pwg.py new file mode 100644 index 0000000..45a480d --- /dev/null +++ b/tests/test_pwg.py @@ -0,0 +1,130 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import torch +from parallel_wavegan.layers import upsample, residual_block +from parallel_wavegan.models import parallel_wavegan as pwgan +from parakeet.utils.layer_tools import summary + +from parakeet.models.parallel_wavegan import ConvInUpsampleNet, ResidualBlock +from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator, ResidualPWGDiscriminator + + +def test_convin_upsample_net(): + net = ConvInUpsampleNet( + [4, 4, 4, 4], + "LeakyReLU", {"negative_slope": 0.2}, + freq_axis_kernel_size=3, + aux_context_window=0) + net2 = upsample.ConvInUpsampleNetwork( + [4, 4, 4, 4], + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + freq_axis_kernel_size=3, + aux_context_window=0) + summary(net) + for k, v in net2.named_parameters(): + print(k, v.shape) + net.state_dict()[k].set_value(v.data.cpu().numpy()) + + c = paddle.randn([4, 80, 180]) + out = net(c) + out2 = net2(torch.as_tensor(c.numpy())) + print(out.numpy()[0]) + print(out2.data.cpu().numpy()[0]) + + +def test_residual_block(): + net = ResidualBlock(dilation=9) + net2 = residual_block.ResidualBlock(dilation=9) + summary(net) + summary(net2) + for k, v in net2.named_parameters(): + net.state_dict()[k].set_value(v.data.cpu().numpy()) + + x = paddle.randn([4, 64, 180]) + c = paddle.randn([4, 80, 180]) + res, skip = net(x, c) + res2, skip2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(c.numpy())) + print(res.numpy()[0]) + print(res2.data.cpu().numpy()[0]) + print(skip.numpy()[0]) + print(skip2.data.cpu().numpy()[0]) + + +def test_pwg_generator(): + net = PWGGenerator( + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}) + net2 = pwgan.ParallelWaveGANGenerator(upsample_params={ + "upsample_scales": [4, 4, 4, 4], + "nonlinear_activation": "LeakyReLU", + "nonlinear_activation_params": { + "negative_slope": 0.2 + } + }) + summary(net) + summary(net2) + for k, v in net2.named_parameters(): + p = net.state_dict()[k] + if k.endswith("_g"): + p.set_value(v.data.cpu().numpy().reshape([-1])) + else: + p.set_value(v.data.cpu().numpy()) + x = paddle.randn([4, 1, 180 * 256]) + c = paddle.randn([4, 80, 180 + 4]) + out = net(x, c) + out2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(c.numpy())) + print(out.numpy()[0]) + print(out2.data.cpu().numpy()[0]) + # print(out.shape) + + +def test_pwg_discriminator(): + net = PWGDiscriminator() + net2 = pwgan.ParallelWaveGANDiscriminator() + summary(net) + summary(net2) + for k, v in net2.named_parameters(): + p = net.state_dict()[k] + if k.endswith("_g"): + p.set_value(v.data.cpu().numpy().reshape([-1])) + else: + p.set_value(v.data.cpu().numpy()) + x = paddle.randn([4, 1, 180 * 256]) + y = net(x) + y2 = net2(torch.as_tensor(x.numpy())) + print(y.numpy()[0]) + print(y2.data.cpu().numpy()[0]) + print(y.shape) + + +def test_residual_pwg_discriminator(): + net = ResidualPWGDiscriminator() + net2 = pwgan.ResidualParallelWaveGANDiscriminator() + summary(net) + summary(net2) + for k, v in net2.named_parameters(): + p = net.state_dict()[k] + if k.endswith("_g"): + p.set_value(v.data.cpu().numpy().reshape([-1])) + else: + p.set_value(v.data.cpu().numpy()) + x = paddle.randn([4, 1, 180 * 256]) + y = net(x) + y2 = net2(torch.as_tensor(x.numpy())) + print(y.numpy()[0]) + print(y2.data.cpu().numpy()[0]) + print(y.shape) diff --git a/tests/test_stft.py b/tests/test_stft.py new file mode 100644 index 0000000..c985235 --- /dev/null +++ b/tests/test_stft.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import torch +import librosa +import numpy as np +from parakeet.modules.stft_loss import STFT, MultiResolutionSTFTLoss +from parallel_wavegan.losses import stft_loss as sl +from scipy import signal + + +def test_stft(): + stft = STFT(n_fft=1024, hop_length=256, win_length=1024) + x = paddle.uniform([4, 46080]) + S = stft.magnitude(x) + window = signal.get_window('hann', 1024, fftbins=True) + D2 = torch.stft( + torch.as_tensor(x.numpy()), + n_fft=1024, + hop_length=256, + win_length=1024, + window=torch.as_tensor(window)) + S2 = (D2**2).sum(-1).sqrt() + S3 = np.abs( + librosa.stft( + x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024)) + print(S2.shape) + print(S.numpy()[0]) + print(S2.data.cpu().numpy()[0]) + print(S3) + + +def test_torch_stft(): + # NOTE: torch.stft use no window by default + x = np.random.uniform(-1.0, 1.0, size=(46080, )) + window = signal.get_window('hann', 1024, fftbins=True) + D2 = torch.stft( + torch.as_tensor(x), + n_fft=1024, + hop_length=256, + win_length=1024, + window=torch.as_tensor(window)) + D3 = librosa.stft( + x, n_fft=1024, hop_length=256, win_length=1024, window='hann') + print(D2[:, :, 0].data.cpu().numpy()[:, 30:60]) + print(D3.real[:, 30:60]) + # print(D3.imag[:, 30:60]) + + +def test_multi_resolution_stft_loss(): + net = MultiResolutionSTFTLoss() + net2 = sl.MultiResolutionSTFTLoss() + + x = paddle.uniform([4, 46080]) + y = paddle.uniform([4, 46080]) + sc, m = net(x, y) + sc2, m2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(y.numpy())) + print(sc.numpy()) + print(sc2.data.cpu().numpy()) + print(m.numpy()) + print(m2.data.cpu().numpy()) From 27d3585606fa8bbca62ae986892e1c9dea36cbb3 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 14:42:11 +0000 Subject: [PATCH 25/47] add some profiling to unittesting --- parakeet/utils/profile.py | 20 +++++++++++++ tests/test_pwg.py | 62 +++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 parakeet/utils/profile.py diff --git a/parakeet/utils/profile.py b/parakeet/utils/profile.py new file mode 100644 index 0000000..1e246eb --- /dev/null +++ b/parakeet/utils/profile.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +def synchronize(): + place = paddle.fluid.framework._current_expected_place() + paddle.fluid.core._cuda_synchronize(place) diff --git a/tests/test_pwg.py b/tests/test_pwg.py index 45a480d..7f2205d 100644 --- a/tests/test_pwg.py +++ b/tests/test_pwg.py @@ -14,13 +14,18 @@ import paddle import torch +from timer import timer from parallel_wavegan.layers import upsample, residual_block from parallel_wavegan.models import parallel_wavegan as pwgan from parakeet.utils.layer_tools import summary +from parakeet.utils.profile import synchronize from parakeet.models.parallel_wavegan import ConvInUpsampleNet, ResidualBlock from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator, ResidualPWGDiscriminator +paddle.set_device("gpu:0") +device = torch.device("cuda:0") + def test_convin_upsample_net(): net = ConvInUpsampleNet( @@ -33,15 +38,34 @@ def test_convin_upsample_net(): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, freq_axis_kernel_size=3, - aux_context_window=0) + aux_context_window=0).to(device) summary(net) for k, v in net2.named_parameters(): print(k, v.shape) net.state_dict()[k].set_value(v.data.cpu().numpy()) c = paddle.randn([4, 80, 180]) - out = net(c) - out2 = net2(torch.as_tensor(c.numpy())) + synchronize() + with timer(unit='s') as t: + out = net(c) + synchronize() + print(f"paddle conv_in_upsample_net forward takes {t.elapse}s.") + + with timer(unit='s') as t: + out.sum().backward() + synchronize() + print(f"paddle conv_in_upsample_net backward takes {t.elapse}s.") + + c_torch = torch.as_tensor(c.numpy()).to(device) + torch.cuda.synchronize() + with timer(unit='s') as t: + out2 = net2(c_torch) + print(f"torch conv_in_upsample_net forward takes {t.elapse}s.") + + with timer(unit='s') as t: + out2.sum().backward() + print(f"torch conv_in_upsample_net backward takes {t.elapse}s.") + print(out.numpy()[0]) print(out2.data.cpu().numpy()[0]) @@ -74,7 +98,7 @@ def test_pwg_generator(): "nonlinear_activation_params": { "negative_slope": 0.2 } - }) + }).to(device) summary(net) summary(net2) for k, v in net2.named_parameters(): @@ -85,8 +109,34 @@ def test_pwg_generator(): p.set_value(v.data.cpu().numpy()) x = paddle.randn([4, 1, 180 * 256]) c = paddle.randn([4, 80, 180 + 4]) - out = net(x, c) - out2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(c.numpy())) + + synchronize() + with timer(unit='s') as t: + out = net(x, c) + synchronize() + print(f"paddle generator forward takes {t.elapse}s.") + + synchronize() + with timer(unit='s') as t: + out.sum().backward() + synchronize() + print(f"paddle generator backward takes {t.elapse}s.") + + x_torch = torch.as_tensor(x.numpy()).to(device) + c_torch = torch.as_tensor(c.numpy()).to(device) + + torch.cuda.synchronize() + with timer(unit='s') as t: + out2 = net2(x_torch, c_torch) + torch.cuda.synchronize() + print(f"torch generator forward takes {t.elapse}s.") + + torch.cuda.synchronize() + with timer(unit='s') as t: + out2.sum().backward() + torch.cuda.synchronize() + print(f"torch generator backward takes {t.elapse}s.") + print(out.numpy()[0]) print(out2.data.cpu().numpy()[0]) # print(out.shape) From 58a988c7895941b79efc34a7e74200426c26d328 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 16 Jun 2021 16:18:13 +0000 Subject: [PATCH 26/47] add code to compute statistics --- .../baker/compute_statistics.py | 114 ++++++++++++++++++ examples/parallelwave_gan/baker/normalize.py | 13 ++ parakeet/utils/h5_utils.py | 2 +- setup.py | 2 + 4 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 examples/parallelwave_gan/baker/compute_statistics.py create mode 100644 examples/parallelwave_gan/baker/normalize.py diff --git a/examples/parallelwave_gan/baker/compute_statistics.py b/examples/parallelwave_gan/baker/compute_statistics.py new file mode 100644 index 0000000..ea696f8 --- /dev/null +++ b/examples/parallelwave_gan/baker/compute_statistics.py @@ -0,0 +1,114 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Calculate statistics of feature files.""" + +import argparse +import logging +import os + +import numpy as np +import yaml +import json + +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from parakeet.datasets.data_table import DataTable +from parakeet.utils.h5_utils import read_hdf5 +from parakeet.utils.h5_utils import write_hdf5 + +from config import get_cfg_default + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Compute mean and variance of dumped raw features.") + parser.add_argument( + "--metadata", + default=None, + type=str, + help="json file with id and file paths ") + parser.add_argument( + "--field-name", + default=None, + type=str, + help="json file with id and file paths ") + parser.add_argument( + "--config", type=str, help="yaml format configuration file.") + parser.add_argument( + "--dumpdir", + default=None, + type=str, + help="directory to save statistics. if not provided, " + "stats will be saved in the above root directory. (default=None)") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + config = get_cfg_default() + # load config + if args.config: + config.merge_from_file(args.config) + + # check directory existence + if args.dumpdir is None: + args.dumpdir = os.path.dirname(args.metadata) + if not os.path.exists(args.dumpdir): + os.makedirs(args.dumpdir) + + with open(args.metadata, 'rt') as f: + metadata = json.load(f) + dataset = DataTable( + metadata, + fields=[args.field_name], + converters={args.field_name: np.load}, ) + logging.info(f"The number of files = {len(dataset)}.") + + # calculate statistics + scaler = StandardScaler() + for datum in tqdm(dataset): + # StandardScalar supports (*, num_features) by default + scaler.partial_fit(datum[args.field_name].T) + + stats = np.stack([scaler.mean_, scaler.scale_], axis=0) + np.save( + os.path.join(args.dumpdir, "stats.npy"), + stats.astype(np.float32), + allow_pickle=False) + + +if __name__ == "__main__": + main() diff --git a/examples/parallelwave_gan/baker/normalize.py b/examples/parallelwave_gan/baker/normalize.py new file mode 100644 index 0000000..185a92b --- /dev/null +++ b/examples/parallelwave_gan/baker/normalize.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/utils/h5_utils.py b/parakeet/utils/h5_utils.py index 7cdbfc6..cd0c670 100644 --- a/parakeet/utils/h5_utils.py +++ b/parakeet/utils/h5_utils.py @@ -57,7 +57,7 @@ def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: def write_hdf5(filename: Union[Path, str], dataset_name: str, - write_data: np.ndarrays, + write_data: np.ndarray, is_overwrite: bool=True) -> None: """Write dataset to HDF5 file. diff --git a/setup.py b/setup.py index 6f112cc..3a6c87e 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,8 @@ setup_info = dict( 'webrtcvad', 'g2pM', 'praatio', + "h5py", + "timer", ], extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, From 30045cf6025cea11b93ce0d292ae4e2e7124d4c5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 17 Jun 2021 03:12:35 +0000 Subject: [PATCH 27/47] move synchronize up before timer starts --- examples/parallelwave_gan/baker/pwg_updater.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index 3ab79b8..29f313b 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -78,15 +78,15 @@ class PWGUpdater(UpdaterBase): # Generator noise = paddle.randn(wav.shape) + _cuda_synchronize(place) with timer() as t: - _cuda_synchronize(place) wav_ = self.generator(noise, mel) _cuda_synchronize(place) print(f"Generator takes {t.elapse}s") ## Multi-resolution stft loss + _cuda_synchronize(place) with timer() as t: - _cuda_synchronize(place) sc_loss, mag_loss = self.criterion_stft( wav_.squeeze(1), wav.squeeze(1)) _cuda_synchronize(place) @@ -108,15 +108,16 @@ class PWGUpdater(UpdaterBase): gen_loss += self.lambda_adv * adv_loss report("train/generator_loss", float(gen_loss)) + + _cuda_synchronize(place) with timer() as t: - _cuda_synchronize(place) self.optimizer_g.clear_grad() gen_loss.backward() _cuda_synchronize(place) print(f"Backward takes {t.elapse}s.") + _cuda_synchronize(place) with timer() as t: - _cuda_synchronize(place) self.optimizer_g.step() self.scheduler_g.step() _cuda_synchronize(place) From fbc7e51fc9699d9a022a33270932eb0a460acc7a Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 18 Jun 2021 02:49:49 +0000 Subject: [PATCH 28/47] 1. add compute_statistics and normalize; 2. use jsonlines to read and write metadata by default; 3. use threadpool to replace processpool in preprocessing cause it is faster. --- .../baker/compute_statistics.py | 18 +-- examples/parallelwave_gan/baker/normalize.py | 141 ++++++++++++++++++ examples/parallelwave_gan/baker/preprocess.py | 22 +-- setup.py | 1 + 4 files changed, 160 insertions(+), 22 deletions(-) diff --git a/examples/parallelwave_gan/baker/compute_statistics.py b/examples/parallelwave_gan/baker/compute_statistics.py index ea696f8..4db003f 100644 --- a/examples/parallelwave_gan/baker/compute_statistics.py +++ b/examples/parallelwave_gan/baker/compute_statistics.py @@ -20,6 +20,7 @@ import os import numpy as np import yaml import json +import jsonlines from sklearn.preprocessing import StandardScaler from tqdm import tqdm @@ -36,20 +37,13 @@ def main(): parser = argparse.ArgumentParser( description="Compute mean and variance of dumped raw features.") parser.add_argument( - "--metadata", - default=None, - type=str, - help="json file with id and file paths ") + "--metadata", type=str, help="json file with id and file paths ") parser.add_argument( - "--field-name", - default=None, - type=str, - help="json file with id and file paths ") + "--field-name", type=str, help="json file with id and file paths ") parser.add_argument( "--config", type=str, help="yaml format configuration file.") parser.add_argument( "--dumpdir", - default=None, type=str, help="directory to save statistics. if not provided, " "stats will be saved in the above root directory. (default=None)") @@ -89,8 +83,8 @@ def main(): if not os.path.exists(args.dumpdir): os.makedirs(args.dumpdir) - with open(args.metadata, 'rt') as f: - metadata = json.load(f) + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) dataset = DataTable( metadata, fields=[args.field_name], @@ -101,7 +95,7 @@ def main(): scaler = StandardScaler() for datum in tqdm(dataset): # StandardScalar supports (*, num_features) by default - scaler.partial_fit(datum[args.field_name].T) + scaler.partial_fit(datum[args.field_name]) stats = np.stack([scaler.mean_, scaler.scale_], axis=0) np.save( diff --git a/examples/parallelwave_gan/baker/normalize.py b/examples/parallelwave_gan/baker/normalize.py index 185a92b..6134917 100644 --- a/examples/parallelwave_gan/baker/normalize.py +++ b/examples/parallelwave_gan/baker/normalize.py @@ -11,3 +11,144 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +"""Normalize feature files and dump them.""" + +import argparse +import logging +import os +from operator import itemgetter +from pathlib import Path + +import numpy as np +import yaml +import jsonlines + +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm + +from parakeet.datasets.data_table import DataTable +from parakeet.utils.h5_utils import read_hdf5 +from parakeet.utils.h5_utils import write_hdf5 + +from config import get_cfg_default + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)." + ) + parser.add_argument( + "--metadata", + type=str, + required=True, + help="directory including feature files to be normalized. " + "you need to specify either *-scp or rootdir.") + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump normalized feature files.") + parser.add_argument( + "--stats", type=str, required=True, help="statistics file.") + parser.add_argument( + "--skip-wav-copy", + default=False, + action="store_true", + help="whether to skip the copy of wav files.") + parser.add_argument( + "--config", type=str, help="yaml format configuration file.") + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. (default=1)") + args = parser.parse_args() + + # set logger + if args.verbose > 1: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + elif args.verbose > 0: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + else: + logging.basicConfig( + level=logging.WARN, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + ) + logging.warning('Skip DEBUG/INFO messages') + + # load config + config = get_cfg_default() + if args.config: + config.merge_from_file(args.config) + + # check directory existence + dumpdir = Path(args.dumpdir).resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, + fields=["utt_id", "wave", "feats"], + converters={ + 'utt_id': None, + 'wave': None if args.skip_wav_copy else np.load, + 'feats': np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + # restore scaler + scaler = StandardScaler() + scaler.mean_ = np.load(args.stats)[0] + scaler.scale_ = np.load(args.stats)[1] + + # from version 0.23.0, this information is needed + scaler.n_features_in_ = scaler.mean_.shape[0] + + # process each file + output_metadata = [] + + for item in tqdm(dataset): + utt_id = item['utt_id'] + wave = item['wave'] + mel = item['feats'] + # normalize + mel = scaler.transform(mel) + + # save + mel_path = dumpdir / f"{utt_id}-feats.npy" + np.save(mel_path, mel.astype(np.float32), allow_pickle=False) + if not args.skip_wav_copy: + wav_path = dumpdir / f"{utt_id}-wave.npy" + np.save(wav_path, wave.astype(np.float32), allow_pickle=False) + else: + wav_path = wave + output_metadata.append({ + 'utt_id': utt_id, + 'wave': str(wav_path), + 'feats': str(mel_path), + }) + output_metadata.sort(key=itemgetter('utt_id')) + output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" + with jsonlines.open(output_metadata_path, 'w') as writer: + for item in output_metadata: + writer.write(item) + logging.info(f"metadata dumped into {output_metadata_path}") + + +if __name__ == "__main__": + main() diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index 23b5f05..09f2004 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -19,6 +19,7 @@ import numpy as np import argparse import yaml import json +import jsonlines import concurrent.futures from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from pathlib import Path @@ -152,14 +153,14 @@ def process_sentence(config: Dict[str, Any], mel_path = output_dir / (utt_id + "_feats.npy") wav_path = output_dir / (utt_id + "_wave.npy") - np.save(wav_path, y) - np.save(mel_path, logmel) + np.save(wav_path, y) # (num_samples, ) + np.save(mel_path, logmel.T) # (num_frames, n_mels) record = { "utt_id": utt_id, "num_samples": num_sample, "num_frames": num_frames, - "feats_path": str(mel_path.resolve()), - "wave_path": str(wav_path.resolve()), + "feats": str(mel_path.resolve()), + "wave": str(wav_path.resolve()), } return record @@ -175,7 +176,7 @@ def process_sentences(config, results.append( process_sentence(config, fp, alignment_fp, output_dir)) else: - with ProcessPoolExecutor(nprocs) as pool: + with ThreadPoolExecutor(nprocs) as pool: futures = [] with tqdm.tqdm(total=len(fps)) as progress: for fp, alignment_fp in zip(fps, alignment_fps): @@ -189,8 +190,9 @@ def process_sentences(config, results.append(ft.result()) results.sort(key=itemgetter("utt_id")) - with open(output_dir / "metadata.json", 'wt') as f: - json.dump(results, f) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) print("Done") @@ -247,11 +249,11 @@ def main(): dev_alignment_files = alignment_files[9800:9900] test_alignment_files = alignment_files[9900:] - train_dump_dir = dumpdir / "train" + train_dump_dir = dumpdir / "train" / "raw" train_dump_dir.mkdir(parents=True, exist_ok=True) - dev_dump_dir = dumpdir / "dev" + dev_dump_dir = dumpdir / "dev" / "raw" dev_dump_dir.mkdir(parents=True, exist_ok=True) - test_dump_dir = dumpdir / "test" + test_dump_dir = dumpdir / "test" / "raw" test_dump_dir.mkdir(parents=True, exist_ok=True) # process for the 3 sections diff --git a/setup.py b/setup.py index 3a6c87e..b7cb4da 100644 --- a/setup.py +++ b/setup.py @@ -74,6 +74,7 @@ setup_info = dict( 'praatio', "h5py", "timer", + 'jsonlines', ], extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, From 7ac0d3ce126ccfb8e84caae64df82002aa75583e Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 18 Jun 2021 03:09:46 +0000 Subject: [PATCH 29/47] add docstring to pwg --- parakeet/models/parallel_wavegan.py | 66 +++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index e7d202d..9986e9e 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -290,7 +290,7 @@ class ResidualBlock(nn.Layer): x : Tensor Shape (N, C_res, T), the input features. c : Tensor - Shape (N, C_aux, T), he auxiliary input. + Shape (N, C_aux, T), the auxiliary input. Returns ------- @@ -525,17 +525,48 @@ class PWGGenerator(nn.Layer): class PWGDiscriminator(nn.Layer): + """A convolutional discriminator for audio. + + Parameters + ---------- + in_channels : int, optional + Number of channels of the input audio, by default 1 + out_channels : int, optional + Output feature size, by default 1 + kernel_size : int, optional + Kernel size of convolutional sublayers, by default 3 + layers : int, optional + Number of layers, by default 10 + conv_channels : int, optional + Feature size of the convolutional sublayers, by default 64 + dilation_factor : int, optional + The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each + convolutional sublayers grows linearly, by default 1 + nonlinear_activation : str, optional + The activation after each convolutional sublayer, by default "LeakyReLU" + nonlinear_activation_params : Dict[str, Any], optional + The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} + bias : bool, optional + Whether to use bias in convolutional sublayers, by default True + use_weight_norm : bool, optional + Whether to use weight normalization at all convolutional sublayers, + by default True + """ + def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - layers=10, - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, - use_weight_norm=True): + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=10, + conv_channels: int=64, + dilation_factor: int=1, + nonlinear_activation: str="LeakyReLU", + nonlinear_activation_params: Dict[ + str, Any]={"negative_slope": 0.2}, + bias: bool=True, + use_weight_norm: bool=True): super().__init__() assert kernel_size % 2 == 1 assert dilation_factor > 0 @@ -572,7 +603,18 @@ class PWGDiscriminator(nn.Layer): if use_weight_norm: self.apply_weight_norm() - def forward(self, x): + def forward(self, x: Tensor): + """ + Parameters + ---------- + x : Tensor + Shape (N, in_channels, T), the input audio. + + Returns + ------- + Tensor + Shape (N, out_channels, T), the predicted logits. + """ return self.conv_layers(x) def apply_weight_norm(self): From 3977632b073773405f20885e89d1471204502eb7 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 18 Jun 2021 03:21:02 +0000 Subject: [PATCH 30/47] add docstrings to pwg --- parakeet/models/parallel_wavegan.py | 85 +++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index 9986e9e..6f2b44b 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -603,17 +603,17 @@ class PWGDiscriminator(nn.Layer): if use_weight_norm: self.apply_weight_norm() - def forward(self, x: Tensor): + def forward(self, x: Tensor) -> Tensor: """ Parameters ---------- x : Tensor - Shape (N, in_channels, T), the input audio. + Shape (N, in_channels, num_samples), the input audio. Returns ------- Tensor - Shape (N, out_channels, T), the predicted logits. + Shape (N, out_channels, num_samples), the predicted logits. """ return self.conv_layers(x) @@ -635,21 +635,59 @@ class PWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer): + """A wavenet-style discriminator for audio. + + Parameters + ---------- + in_channels : int, optional + Number of channels of the input audio, by default 1 + out_channels : int, optional + Output feature size, by default 1 + kernel_size : int, optional + Kernel size of residual blocks, by default 3 + layers : int, optional + Number of residual blocks, by default 30 + stacks : int, optional + Number of groups of residual blocks, within which the dilation + of each residual blocks grows exponentially, by default 3 + residual_channels : int, optional + Residual channels of residual blocks, by default 64 + gate_channels : int, optional + Gate channels of residual blocks, by default 128 + skip_channels : int, optional + Skip channels of residual blocks, by default 64 + dropout : float, optional + Dropout probability of residual blocks, by default 0. + bias : bool, optional + Whether to use bias in residual blocks, by default True + use_weight_norm : bool, optional + Whether to use weight normalization in all convolutional layers, + by default True + use_causal_conv : bool, optional + Whether to use causal convolution in residual blocks, by default False + nonlinear_activation : str, optional + Activation after convolutions other than those in residual blocks, + by default "LeakyReLU" + nonlinear_activation_params : Dict[str, Any], optional + Parameters to pass to the activation, by default {"negative_slope": 0.2} + """ + def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - layers=30, - stacks=3, - residual_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0., - bias=True, - use_weight_norm=True, - use_causal_conv=False, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}): + in_channels: int=1, + out_channels: int=1, + kernel_size: int=3, + layers: int=30, + stacks: int=3, + residual_channels: int=64, + gate_channels: int=128, + skip_channels: int=64, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=True, + use_causal_conv: bool=False, + nonlinear_activation: str="LeakyReLU", + nonlinear_activation_params: Dict[ + str, Any]={"negative_slope": 0.2}): super().__init__() assert kernel_size % 2 == 1 self.in_channels = in_channels @@ -692,7 +730,18 @@ class ResidualPWGDiscriminator(nn.Layer): if use_weight_norm: self.apply_weight_norm() - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Parameters + ---------- + x : Tensor + Shape (N, in_channels, num_samples), the input audio. + + Returns + ------- + Tensor + Shape (N, out_channels, num_samples), the predicted logits. + """ x = self.first_conv(x) skip = 0 for f in self.conv_layers: From a738954001a2514bedbcf135fc73ce2ea80d0c36 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 18 Jun 2021 09:44:32 +0000 Subject: [PATCH 31/47] 1. change default data layout to channel last in preprocessing; 2. add Summary and DictSummary for aggrelation of evaluation losses; 3. add unittest for report ans scope. --- examples/parallelwave_gan/baker/batch_fn.py | 16 +-- examples/parallelwave_gan/baker/train.py | 33 +++--- parakeet/modules/stft_loss.py | 11 +- parakeet/training/reporter.py | 112 ++++++++++++++++++++ tests/test_reporter.py | 51 +++++++++ 5 files changed, 197 insertions(+), 26 deletions(-) create mode 100644 tests/test_reporter.py diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py index be22dbb..aff647c 100644 --- a/examples/parallelwave_gan/baker/batch_fn.py +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -50,7 +50,8 @@ class Clip(object): """Convert into batch tensors. Args: - batch (list): list of tuple of the pair of audio and features. + batch (list): list of tuple of the pair of audio and features. Audio shape + (T, ), features shape(T', C). Returns: Tensor: Auxiliary feature batch (B, C, T'), where @@ -60,13 +61,13 @@ class Clip(object): """ # check length examples = [ - self._adjust_length(b['wave_path'], b['feats_path']) - for b in examples if b['feats_path'].shape[1] > self.mel_threshold + self._adjust_length(b['wave'], b['feats']) for b in examples + if b['feats'].shape[0] > self.mel_threshold ] xs, cs = [b[0] for b in examples], [b[1] for b in examples] # make batch with random cut - c_lengths = [c.shape[1] for c in cs] + c_lengths = [c.shape[0] for c in cs] start_frames = np.array([ np.random.randint(self.start_offset, cl + self.end_offset) for cl in c_lengths @@ -79,12 +80,13 @@ class Clip(object): y_batch = np.stack( [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)]) c_batch = np.stack( - [c[:, start:end] for c, start, end in zip(cs, c_starts, c_ends)]) + [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)]) # convert each batch to tensor, asuume that each item in batch has the same length y_batch = paddle.to_tensor( y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) - c_batch = paddle.to_tensor(c_batch, dtype=paddle.float32) # (B, C, T') + c_batch = paddle.to_tensor( + c_batch, dtype=paddle.float32).transpose([0, 2, 1]) # (B, C, T') return y_batch, c_batch @@ -103,6 +105,6 @@ class Clip(object): # check the legnth is valid assert len(x) == c.shape[ - 1] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[1]})" + 0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})" return x, c diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index d424c5d..087963e 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -20,7 +20,7 @@ import dataclasses from pathlib import Path import yaml -import json +import jsonlines import paddle import numpy as np from paddle import nn @@ -61,23 +61,23 @@ def train_sp(args, config): ) # construct dataset for training and validation - with open(args.train_metadata) as f: - train_metadata = json.load(f) + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) train_dataset = DataTable( data=train_metadata, - fields=["wave_path", "feats_path"], + fields=["wave", "feats"], converters={ - "wave_path": np.load, - "feats_path": np.load, + "wave": np.load, + "feats": np.load, }, ) - with open(args.dev_metadata) as f: - dev_metadata = json.load(f) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) dev_dataset = DataTable( data=dev_metadata, - fields=["wave_path", "feats_path"], + fields=["wave", "feats"], converters={ - "wave_path": np.load, - "feats_path": np.load, + "wave": np.load, + "feats": np.load, }, ) # collate function and dataloader @@ -169,12 +169,13 @@ def train_sp(args, config): trainer = Trainer( updater, - stop_trigger=(10, "iteration"), # PROFILING + stop_trigger=(config.train_max_steps, "iteration"), # PROFILING out=output_dir, ) - with paddle.fluid.profiler.profiler('All', 'total', - str(output_dir / "profiler.log"), - 'Default') as prof: - trainer.run() + + # with paddle.fluid.profiler.profiler('All', 'total', + # str(output_dir / "profiler.log"), + # 'Default') as prof: + trainer.run() def main(): diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index 6531010..f98a3dd 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -35,8 +35,9 @@ class SpectralConvergenceLoss(nn.Layer): Tensor: Spectral convergence loss value. """ return paddle.norm( - y_mag - x_mag, p="fro") / paddle.norm( - y_mag, p="fro") + y_mag - x_mag, p="fro") / paddle.clip( + paddle.norm( + y_mag, p="fro"), min=1e-10) class LogSTFTMagnitudeLoss(nn.Layer): @@ -54,7 +55,11 @@ class LogSTFTMagnitudeLoss(nn.Layer): Returns: Tensor: Log STFT magnitude loss value. """ - return F.l1_loss(paddle.log(y_mag), paddle.log(x_mag)) + return F.l1_loss( + paddle.log(paddle.clip( + y_mag, min=1e-10)), + paddle.log(paddle.clip( + x_mag, min=1e-10))) class STFTLoss(nn.Layer): diff --git a/parakeet/training/reporter.py b/parakeet/training/reporter.py index 3f4d77f..c2f171c 100644 --- a/parakeet/training/reporter.py +++ b/parakeet/training/reporter.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math import contextlib +from collections import defaultdict OBSERVATIONS = None @@ -45,3 +47,113 @@ def report(name, value): return else: observations[name] = value + + +class Summary(object): + """Online summarization of a sequence of scalars. + Summary computes the statistics of given scalars online. + """ + + def __init__(self): + self._x = 0.0 + self._x2 = 0.0 + self._n = 0 + + def add(self, value, weight=1): + """Adds a scalar value. + + Args: + value: Scalar value to accumulate. It is either a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + weight: An optional weight for the value. It is a NumPy scalar or + a zero-dimensional array (on CPU or GPU). + Default is 1 (integer). + + """ + self._x += weight * value + self._x2 += weight * value * value + self._n += weight + + def compute_mean(self): + """Computes the mean.""" + x, n = self._x, self._n + return x / n + + def make_statistics(self): + """Computes and returns the mean and standard deviation values. + + Returns: + tuple: Mean and standard deviation values. + + """ + x, n = self._x, self._n + mean = x / n + var = self._x2 / n - mean * mean + std = math.sqrt(var) + return mean, std + + +class DictSummary(object): + """Online summarization of a sequence of dictionaries. + + ``DictSummary`` computes the statistics of a given set of scalars online. + It only computes the statistics for scalar values and variables of scalar + values in the dictionaries. + + """ + + def __init__(self): + self._summaries = defaultdict(Summary) + + def add(self, d): + """Adds a dictionary of scalars. + + Args: + d (dict): Dictionary of scalars to accumulate. Only elements of + scalars, zero-dimensional arrays, and variables of + zero-dimensional arrays are accumulated. When the value + is a tuple, the second element is interpreted as a weight. + + """ + summaries = self._summaries + for k, v in d.items(): + w = 1 + if isinstance(v, tuple): + w = v[1] + v = v[0] + summaries[k].add(v, weight=w) + + def compute_mean(self): + """Creates a dictionary of mean values. + + It returns a single dictionary that holds a mean value for each entry + added to the summary. + + Returns: + dict: Dictionary of mean values. + + """ + return { + name: summary.compute_mean() + for name, summary in self._summaries.items() + } + + def make_statistics(self): + """Creates a dictionary of statistics. + + It returns a single dictionary that holds mean and standard deviation + values for every entry added to the summary. For an entry of name + ``'key'``, these values are added to the dictionary by names ``'key'`` + and ``'key.std'``, respectively. + + Returns: + dict: Dictionary of statistics of all entries. + + """ + stats = {} + for name, summary in self._summaries.items(): + mean, std = summary.make_statistics() + stats[name] = mean + stats[name + '.std'] = std + + return stats diff --git a/tests/test_reporter.py b/tests/test_reporter.py new file mode 100644 index 0000000..cd40364 --- /dev/null +++ b/tests/test_reporter.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from parakeet.training.reporter import report, scope +from parakeet.training.reporter import Summary, DictSummary + + +def test_reporter_scope(): + first = {} + second = {} + third = {} + + with scope(first): + report("first_begin", 1) + with scope(second): + report("second_begin", 2) + with scope(third): + report("third_begin", 3) + report("third_end", 4) + report("seconf_end", 5) + report("first_end", 6) + + assert first == {'first_begin': 1, 'first_end': 6} + assert second == {'second_begin': 2, 'seconf_end': 5} + assert third == {'third_begin': 3, 'third_end': 4} + print(first) + print(second) + print(third) + + +def test_summary(): + summary = Summary() + summary.add(1) + summary.add(2) + summary.add(3) + state = summary.make_statistics() + print(state) + np.testing.assert_allclose( + np.array(list(state)), np.array([2.0, np.std([1, 2, 3])])) From 83c9f0aeae9bd0046b222fbd8b8fe1cd7e22b3e4 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 21 Jun 2021 09:56:26 +0000 Subject: [PATCH 32/47] add snapshot and visualizer --- examples/parallelwave_gan/baker/normalize.py | 9 - .../parallelwave_gan/baker/pwg_updater.py | 147 +++++--- examples/parallelwave_gan/baker/train.py | 48 ++- parakeet/training/extensions/evaluator.py | 63 ++++ parakeet/training/extensions/snapshot.py | 44 +++ parakeet/training/extensions/visualizer.py | 34 ++ parakeet/training/seeding.py | 26 ++ parakeet/training/trainer.py | 9 +- parakeet/training/updater.py | 165 +++++++-- parakeet/utils/profile.py | 5 +- parakeet/utils/timeline.py | 319 ++++++++++++++++++ 11 files changed, 764 insertions(+), 105 deletions(-) create mode 100644 parakeet/training/extensions/evaluator.py create mode 100644 parakeet/training/extensions/snapshot.py create mode 100644 parakeet/training/extensions/visualizer.py create mode 100644 parakeet/training/seeding.py create mode 100644 parakeet/utils/timeline.py diff --git a/examples/parallelwave_gan/baker/normalize.py b/examples/parallelwave_gan/baker/normalize.py index 6134917..0cf2841 100644 --- a/examples/parallelwave_gan/baker/normalize.py +++ b/examples/parallelwave_gan/baker/normalize.py @@ -11,12 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright 2019 Tomoki Hayashi -# MIT License (https://opensource.org/licenses/MIT) """Normalize feature files and dump them.""" import argparse @@ -28,13 +22,10 @@ from pathlib import Path import numpy as np import yaml import jsonlines - from sklearn.preprocessing import StandardScaler from tqdm import tqdm from parakeet.datasets.data_table import DataTable -from parakeet.utils.h5_utils import read_hdf5 -from parakeet.utils.h5_utils import write_hdf5 from config import get_cfg_default diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index 29f313b..bd7dbeb 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -12,36 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +from typing import Dict + import paddle -from paddle.fluid.core import _cuda_synchronize +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from timer import timer from parakeet.datasets.data_table import DataTable -from parakeet.training.updater import UpdaterBase, UpdaterState +from parakeet.training.updater import UpdaterBase, UpdaterState, StandardUpdater +from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.trainer import Trainer from parakeet.training.reporter import report from parakeet.training.checkpoint import KBest, KLatest from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss +from parakeet.utils.profile import synchronize -class PWGUpdater(UpdaterBase): +class PWGUpdater(StandardUpdater): def __init__( self, - models, - optimizers, - criterions, - schedulers, - dataloaders, + models: Dict[str, Layer], + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], + schedulers: Dict[str, LRScheduler], + dataloader: DataLoader, discriminator_train_start_steps: int, lambda_adv: float, ): self.models = models - self.generator = models['generator'] - self.discriminator = models['discriminator'] + self.generator: Layer = models['generator'] + self.discriminator: Layer = models['discriminator'] self.optimizers = optimizers - self.optimizer_g = optimizers['generator'] - self.optimizer_d = optimizers['discriminator'] + self.optimizer_g: Optimizer = optimizers['generator'] + self.optimizer_d: Optimizer = optimizers['discriminator'] self.criterions = criterions self.criterion_stft = criterions['stft'] @@ -51,46 +60,34 @@ class PWGUpdater(UpdaterBase): self.scheduler_g = schedulers['generator'] self.scheduler_d = schedulers['discriminator'] - self.dataloaders = dataloaders - self.train_dataloader = dataloaders['train'] - self.dev_dataloader = dataloaders['dev'] + self.dataloader = dataloader self.discriminator_train_start_steps = discriminator_train_start_steps self.lambda_adv = lambda_adv self.state = UpdaterState(iteration=0, epoch=0) - self.train_iterator = iter(self.train_dataloader) - - def update_core(self): - place = paddle.fluid.framework._current_expected_place() - with timer() as t: - _cuda_synchronize(place) - try: - batch = next(self.train_iterator) - except StopIteration: - self.train_iterator = iter(self.train_dataloader) - batch = next(self.train_iterator) - _cuda_synchronize(place) - print(f"Loading a batch takes {t.elapse}s") + self.train_iterator = iter(self.dataloader) + def update_core(self, batch): + # parse batch wav, mel = batch # Generator noise = paddle.randn(wav.shape) - _cuda_synchronize(place) + synchronize() with timer() as t: wav_ = self.generator(noise, mel) - _cuda_synchronize(place) - print(f"Generator takes {t.elapse}s") + synchronize() + logging.debug(f"Generator takes {t.elapse}s.") ## Multi-resolution stft loss - _cuda_synchronize(place) + synchronize() with timer() as t: sc_loss, mag_loss = self.criterion_stft( wav_.squeeze(1), wav.squeeze(1)) - _cuda_synchronize(place) - print(f"Multi-resolution STFT loss takes {t.elapse}s") + synchronize() + logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.") report("train/spectral_convergence_loss", float(sc_loss)) report("train/log_stft_magnitude_loss", float(mag_loss)) @@ -98,30 +95,31 @@ class PWGUpdater(UpdaterBase): ## Adversarial loss if self.state.iteration > self.discriminator_train_start_steps: + synchronize() with timer() as t: - _cuda_synchronize(place) p_ = self.discriminator(wav_) adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) - _cuda_synchronize(place) - print(f"Discriminator and adversarial loss takes {t.elapse}s") + synchronize() + logging.debug( + f"Discriminator and adversarial loss takes {t.elapse}s") report("train/adversarial_loss", float(adv_loss)) gen_loss += self.lambda_adv * adv_loss report("train/generator_loss", float(gen_loss)) - _cuda_synchronize(place) + synchronize() with timer() as t: self.optimizer_g.clear_grad() gen_loss.backward() - _cuda_synchronize(place) - print(f"Backward takes {t.elapse}s.") + synchronize() + logging.debug(f"Backward takes {t.elapse}s.") - _cuda_synchronize(place) + synchronize() with timer() as t: self.optimizer_g.step() self.scheduler_g.step() - _cuda_synchronize(place) - print(f"Update takes {t.elapse}s.") + synchronize() + logging.debug(f"Update takes {t.elapse}s.") # Disctiminator if self.state.iteration > self.discriminator_train_start_steps: @@ -138,5 +136,68 @@ class PWGUpdater(UpdaterBase): self.optimizer_d.clear_grad() dis_loss.backward() + self.optimizer_d.step() self.scheduler_d.step() + + +class PWGEvaluator(StandardEvaluator): + def __init__(self, models, criterions, dataloader, lambda_adv): + self.models = models + self.generator = models['generator'] + self.discriminator = models['discriminator'] + + self.criterions = criterions + self.criterion_stft = criterions['stft'] + self.criterion_mse = criterions['mse'] + + self.dataloader = dataloader + self.lambda_adv = lambda_adv + + def evaluate_core(self, batch): + logging.debug("Evaluate: ") + wav, mel = batch + noise = paddle.randn(wav.shape) + + synchronize() + with timer() as t: + wav_ = self.generator(noise, mel) + synchronize() + logging.debug(f"Generator takes {t.elapse}s") + + ## Multi-resolution stft loss + synchronize() + with timer() as t: + sc_loss, mag_loss = self.criterion_stft( + wav_.squeeze(1), wav.squeeze(1)) + synchronize() + logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s") + + report("eval/spectral_convergence_loss", float(sc_loss)) + report("eval/log_stft_magnitude_loss", float(mag_loss)) + gen_loss = sc_loss + mag_loss + + ## Adversarial loss + synchronize() + with timer() as t: + p_ = self.discriminator(wav_) + adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) + synchronize() + logging.debug( + f"Discriminator and adversarial loss takes {t.elapse}s") + report("eval/adversarial_loss", float(adv_loss)) + gen_loss += self.lambda_adv * adv_loss + + report("eval/generator_loss", float(gen_loss)) + + # Disctiminator + with paddle.no_grad(): + wav_ = self.generator(noise, mel) + p = self.discriminator(wav) + p_ = self.discriminator(wav_.detach()) + real_loss = self.criterion_mse(p, paddle.ones_like(p)) + fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) + report("eval/real_loss", float(real_loss)) + report("eval/fake_loss", float(fake_loss)) + dis_loss = real_loss + fake_loss + report("eval/discriminator_loss", float(dis_loss)) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 087963e..2854b66 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -39,10 +39,13 @@ from parakeet.training.reporter import report from parakeet.training.checkpoint import KBest, KLatest from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss +from parakeet.training.extensions.visualizer import VisualDL +from parakeet.training.extensions.snapshot import Snapshot +from parakeet.training.seeding import seed_everything from batch_fn import Clip from config import get_cfg_default -from pwg_updater import PWGUpdater +from pwg_updater import PWGUpdater, PWGEvaluator def train_sp(args, config): @@ -56,6 +59,9 @@ def train_sp(args, config): if world_size > 1: paddle.distributed.init_parallel_env() + # set the random seed, it is a must for multiprocess training + seed_everything(42) + print( f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) @@ -128,8 +134,7 @@ def train_sp(args, config): parameters=generator.parameters(), **config["generator_optimizer_params"]) lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) - gradient_clip_d = nn.ClipGradByGlobalNorm(config[ - "discriminator_grad_norm"]) + gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"]) optimizer_d = Adam( learning_rate=lr_schedule_d, grad_clip=gradient_clip_d, @@ -138,10 +143,10 @@ def train_sp(args, config): print("optimizers done!") output_dir = Path(args.output_dir) - log_writer = None + checkpoint_dir = output_dir / "checkpoints" if dist.get_rank() == 0: output_dir.mkdir(parents=True, exist_ok=True) - log_writer = LogWriter(str(output_dir)) + checkpoint_dir.mkdir(parents=True, exist_ok=True) updater = PWGUpdater( models={ @@ -160,18 +165,41 @@ def train_sp(args, config): "generator": lr_schedule_g, "discriminator": lr_schedule_d, }, - dataloaders={ - "train": train_dataloader, - "dev": dev_dataloader, - }, + dataloader=train_dataloader, discriminator_train_start_steps=config.discriminator_train_start_steps, lambda_adv=config.lambda_adv, ) + evaluator = PWGEvaluator( + models={ + "generator": generator, + "discriminator": discriminator, + }, + criterions={ + "stft": criterion_stft, + "mse": criterion_mse, + }, + dataloader=dev_dataloader, + lambda_adv=config.lambda_adv, ) + trainer = Trainer( updater, - stop_trigger=(config.train_max_steps, "iteration"), # PROFILING + stop_trigger=(config.train_max_steps, "iteration"), out=output_dir, ) + trainer.extend( + evaluator, + trigger=(config.eval_interval_steps, 'iteration'), + priority=3) + if dist.get_rank() == 0: + log_writer = LogWriter(str(output_dir)) + trainer.extend( + VisualDL(log_writer), trigger=(1, 'iteration'), priority=1) + trainer.extend( + Snapshot(checkpoint_dir), + trigger=(config.save_interval_steps, 'iteration'), + priority=2) + print("Trainer Done!") + # with paddle.fluid.profiler.profiler('All', 'total', # str(output_dir / "profiler.log"), # 'Default') as prof: diff --git a/parakeet/training/extensions/evaluator.py b/parakeet/training/extensions/evaluator.py new file mode 100644 index 0000000..eb3f877 --- /dev/null +++ b/parakeet/training/extensions/evaluator.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from typing import Optional, Callable, Dict + +from tqdm import tqdm +import paddle +from paddle import Tensor +from paddle.nn import Layer +from paddle.io import DataLoader + +from parakeet.training.reporter import scope, report, DictSummary + + +class StandardEvaluator(object): + def __init__(self, model: Layer, dataloader: DataLoader): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # dataloaders + self.dataloader = dataloader + + def evaluate_core(self, batch): + # compute + self.model(batch) # you may report here + + def evaluate(self): + # switch to eval mode + for layer in self.models.values(): + layer.eval() + + summary = DictSummary() + for batch in self.dataloader: + observation = {} + with scope(observation): + with paddle.no_grad(): + self.evaluate_core( + batch) # main evaluation computation here. + summary.add(observation) + summary = summary.compute_mean() + return summary + + def __call__(self, trainer=None): + self.observation = {} + with scope(self.observation): + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) + print(self.observation) diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py new file mode 100644 index 0000000..e31403b --- /dev/null +++ b/parakeet/training/extensions/snapshot.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union +from pathlib import Path + +from parakeet.utils.mp_tools import rank_zero_only +from parakeet.training.trainer import Trainer + + +class Snapshot(object): + """An extension to make snapshot of the updater object inside + the trainer. It is done by calling the updater's `save` method. + + An Updater save its state_dict by default, which contains the + updater state, (i.e. epoch and iteration) and all the model + parameters and optimizer states. If the updater inside the trainer + subclasses StandardUpdater, everything is good to go. + + Parameters + ---------- + checkpoint_dir : Union[str, Path] + The directory to save checkpoints into. + """ + + def __init__(self, checkpoint_dir: Union[str, Path]): + self.checkpoint_dir = Path(checkpoint_dir) + + @rank_zero_only + def __call__(self, trainer: Trainer): + iteration = trainer.updater.state.iteration + path = self.checkpoint_dir / f"step_{iteration}.pdz" + trainer.updater.save(str(path)) diff --git a/parakeet/training/extensions/visualizer.py b/parakeet/training/extensions/visualizer.py new file mode 100644 index 0000000..9a1bcbb --- /dev/null +++ b/parakeet/training/extensions/visualizer.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from visualdl import LogWriter +from parakeet.training.trainer import Trainer +from parakeet.utils.mp_tools import rank_zero_only + + +class VisualDL(object): + """A wrapper of visualdl log writer. It assumes that the metrics to be visualized + are all scalars which are recorded into the `.observation` dictionary of the + trainer object. The dictionary is created for each step, thus the visualdl log + writer uses the iteration from the updater's `iteration` as the global step to + add records. + """ + + def __init__(self, writer: LogWriter): + self.writer = writer + + @rank_zero_only + def __call__(self, trainer: Trainer): + for k, v in trainer.observation.items(): + self.writer.add_scalar(k, v, step=trainer.updater.state.iteration) diff --git a/parakeet/training/seeding.py b/parakeet/training/seeding.py new file mode 100644 index 0000000..1a6660f --- /dev/null +++ b/parakeet/training/seeding.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import paddle +import random +import numpy as np + + +def seed_everything(seed: int): + paddle.seed(seed) + random.seed(seed) + np.random.seed(seed) + logging.debug(f"Set the seed of paddle, random, np.random to {seed}.") diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index ad3661f..99e5114 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -40,15 +40,13 @@ class Trainer(object): self.out = Path(out) self.observation = {} - def setup(self): - pass - def extend(self, extension, name=None, trigger=None, priority=None): trigger = get_trigger(trigger) ordinal = 0 modified_name = name - while name in self.extensions: + while modified_name in self.extensions: + print(self.extensions.keys()) ordinal += 1 modified_name = f"{name}_{ordinal}" @@ -61,8 +59,7 @@ class Trainer(object): self.extensions.keys(), key=lambda name: self.extensions[name].priority, reverse=True) - extensions = [(name, self.extensions[name]) - for name in extension_order] + extensions = [(name, self.extensions[name]) for name in extension_order] update = self.updater.update stop_trigger = self.stop_trigger diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index c8383f2..cb2213c 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -12,12 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging from dataclasses import dataclass from typing import Optional +from typing import Dict +from typing import Union +from timer import timer +import paddle +from paddle import Tensor from paddle.nn import Layer from paddle.optimizer import Optimizer from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from parakeet.training.reporter import report @dataclass @@ -56,12 +65,34 @@ class UpdaterBase(object): So the best practice is to define a model and define a updater for it. """ - def update(self): - self.state.iteration += 1 - self.update_core() + def __init__(self, init_state=None): + if init_state is None: + self.state = UpdaterState() + else: + self.state = init_state - def update_core(self): - pass + def update(self, batch): + raise NotImplementedError( + "Implement your own `update` method for training a step.") + + def state_dict(self): + state_dict = { + "epoch": self.state.epoch, + "iteration": self.state.iteration, + } + return state_dict + + def set_state_dict(self, state_dict): + self.state.epoch = state_dict["epoch"] + self.state.iteration = state_dict["iteration"] + + def save(self, path): + archive = self.state_dict() + paddle.save(archive, path) + + def load(self, path): + archive = paddle.load(path) + self.set_state_dict(archive) class StandardUpdater(UpdaterBase): @@ -71,54 +102,116 @@ class StandardUpdater(UpdaterBase): def __init__(self, model: Layer, - dataloader: DataLoader, optimizer: Optimizer, - loss_func=None, - auto_new_epoch: bool=True, + dataloader: DataLoader, init_state: Optional[UpdaterState]=None): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models self.model = model - self.dataloader = dataloader - self.optimizer = optimizer - self.loss_func = loss_func - self.auto_new_epoch = auto_new_epoch - self.iterator = iter(dataloader) + # it is designed to hold multiple optimizers + optimizers = {"main": optimizer} + self.optimizer = optimizer + self.optimizers: Dict[str, Optimizer] = optimizers + + # dataloaders + self.dataloader = dataloader + + # init state if init_state is None: self.state = UpdaterState() else: self.state = init_state + self.train_iterator = iter(dataloader) + def update(self): - self.update_core() self.state.iteration += 1 + # switch to training mode + for layer in self.models.values(): + layer.train() + + # training for a step is implemented here + batch = self.read_batch() + self.update_core(batch) + + def update_core(self, batch): + """A simple case for a training step. Basic assumptions are: + Single model; + Single optimizer; + A batch from the dataloader is just the input of the model; + The model return a single loss, or a dict containing serval losses. + Parameters updates at every batch, no gradient accumulation. + """ + loss = self.model(*batch) + + if isinstance(loss, Tensor): + loss_dict = {"main": loss} + else: + # Dict[str, Tensor] + loss_dict = loss + if "main" not in loss_dict: + main_loss = 0 + for loss_item in loss.values(): + main_loss += loss_item + loss_dict["main"] = main_loss + + for name, loss_item in loss_dict.items(): + report(name, float(loss_item)) + + self.optimizer.clear_gradient() + loss_dict["main"].backward() + self.optimizer.update() + def new_epoch(self): - self.iterator = iter(self.dataloader) + """Start a new epoch.""" self.state.epoch += 1 - def update_core(self): - model = self.model - optimizer = self.optimizer - loss_func = self.loss_func + # NOTE: all batch sampler for distributed training should + # subclass DistributedBatchSampler and implement `set_epoch` method + batch_sampler = self.dataloader.batch_sampler + if isinstance(batch_sampler, DistributedBatchSampler): + batch_sampler.set_epoch(self.state.epoch) + self.train_iterator = iter(self.dataloader) - model.train() - optimizer.clear_grad() - - # fetch a batch - try: - batch = next(self.iterator) - except StopIteration as e: - if self.auto_new_epoch: + def read_batch(self): + """Read a batch from the data loader, auto renew when data is exhausted.""" + with timer() as t: + try: + batch = next(self.train_iterator) + except StopIteration: self.new_epoch() + batch = next(self.train_iterator) + logging.debug( + f"Read a batch takes {t.elapse}s.") # replace it with logging + return batch - # forward - if self.loss_func is not None: - loss = loss_func(batch) - else: - loss = model(batch) + def state_dict(self): + """State dict of a Updater, model, optimizer and updater state are included.""" + state_dict = super().state_dict() + for name, layer in self.models.items(): + state_dict[f"{name}_params"] = layer.state_dict() + for name, optim in self.optimizers.items(): + state_dict[f"{name}_optimizer"] = optim.state_dict() + return state_dict - # backward - loss.backward() + def set_state_dict(self, state_dict): + """Set state dict for a Updater. Parameters of models, states for + optimizers and UpdaterState are restored.""" + for name, layer in self.models.items(): + layer.set_state_dict(state_dict[f"{name}_params"]) + for name, optim in self.optimizers.items(): + optim.set_state_dict(state_dict[f"{name}_optimizer"]) + super().set_state_dict(state_dict) - # update parameters - optimizer.step() + def save(self, path): + """Save Updater state dict.""" + archive = self.state_dict() + paddle.save(archive, path) + + def load(self, path): + """Load Updater state dict.""" + archive = paddle.load(path) + self.set_state_dict(archive) diff --git a/parakeet/utils/profile.py b/parakeet/utils/profile.py index 1e246eb..29007a9 100644 --- a/parakeet/utils/profile.py +++ b/parakeet/utils/profile.py @@ -13,8 +13,11 @@ # limitations under the License. import paddle +from paddle.framework import CUDAPlace def synchronize(): + """Trigger cuda synchronization for better timing.""" place = paddle.fluid.framework._current_expected_place() - paddle.fluid.core._cuda_synchronize(place) + if isinstance(place, CUDAPlace): + paddle.fluid.core._cuda_synchronize(place) diff --git a/parakeet/utils/timeline.py b/parakeet/utils/timeline.py new file mode 100644 index 0000000..2a399b7 --- /dev/null +++ b/parakeet/utils/timeline.py @@ -0,0 +1,319 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import six +import sys +import unittest + +import google.protobuf.text_format as text_format +import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--profile_path', + type=str, + default='', + help='Input profile file name. If there are multiple file, the format ' + 'should be trainer1=file1,trainer2=file2,ps=file3') +parser.add_argument( + '--timeline_path', type=str, default='', help='Output timeline file name.') +args = parser.parse_args() + + +class _ChromeTraceFormatter(object): + def __init__(self): + self._events = [] + self._metadata = [] + + def _create_event(self, ph, category, name, pid, tid, timestamp): + """Creates a new Chrome Trace event. + + For details of the file format, see: + https://github.com/catapult-project/catapult/blob/master/tracing/README.md + + Args: + ph: The type of event - usually a single character. + category: The event category as a string. + name: The event name as a string. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + timestamp: The timestamp of this event as a long integer. + + Returns: + A JSON compatible event object. + """ + event = {} + event['ph'] = ph + event['cat'] = category + event['name'] = name.replace("ParallelExecutor::Run/", "") + event['pid'] = pid + event['tid'] = tid + event['ts'] = timestamp + return event + + def emit_pid(self, name, pid): + """Adds a process metadata event to the trace. + + Args: + name: The process name as a string. + pid: Identifier of the process as an integer. + """ + event = {} + event['name'] = 'process_name' + event['ph'] = 'M' + event['pid'] = pid + event['args'] = {'name': name} + self._metadata.append(event) + + def emit_region(self, timestamp, duration, pid, tid, category, name, args): + """Adds a region event to the trace. + + Args: + timestamp: The start timestamp of this region as a long integer. + duration: The duration of this region as a long integer. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + category: The event category as a string. + name: The event name as a string. + args: A JSON-compatible dictionary of event arguments. + """ + event = self._create_event('X', category, name, pid, tid, timestamp) + event['dur'] = duration + event['args'] = args + self._events.append(event) + + def emit_counter(self, category, name, pid, timestamp, counter, value): + """Emits a record for a single counter. + + Args: + category: The event category as string + name: The event name as string + pid: Identifier of the process generating this event as integer + timestamp: The timestamps of this event as long integer + counter: Name of the counter as string + value: Value of the counter as integer + tid: Thread id of the allocation as integer + """ + event = self._create_event('C', category, name, pid, 0, timestamp) + event['args'] = {counter: value} + self._events.append(event) + + def format_to_string(self, pretty=False): + """Formats the chrome trace to a string. + + Args: + pretty: (Optional.) If True, produce human-readable JSON output. + + Returns: + A JSON-formatted string in Chrome Trace format. + """ + trace = {} + trace['traceEvents'] = self._metadata + self._events + if pretty: + return json.dumps(trace, indent=4, separators=(',', ': ')) + else: + return json.dumps(trace, separators=(',', ':')) + + +class Timeline(object): + def __init__(self, profile_dict): + self._profile_dict = profile_dict + self._pid = 0 + self._devices = dict() + self._mem_devices = dict() + self._chrome_trace = _ChromeTraceFormatter() + + def _allocate_pid(self): + cur_pid = self._pid + self._pid += 1 + return cur_pid + + def _allocate_pids(self): + for k, profile_pb in six.iteritems(self._profile_dict): + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + if (k, event.device_id, "CPU") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "CPU")] = pid + # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy) + if event.device_id == -1: + self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) + else: + self._chrome_trace.emit_pid( + "%s:cpu:block:%d" % (k, event.device_id), pid) + elif event.type == profiler_pb2.Event.GPUKernel: + if (k, event.device_id, "GPUKernel") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "GPUKernel")] = pid + self._chrome_trace.emit_pid("%s:gpu:%d" % + (k, event.device_id), pid) + if not hasattr(profile_pb, "mem_events"): + continue + for mevent in profile_pb.mem_events: + if mevent.place == profiler_pb2.MemEvent.CUDAPlace: + if (k, mevent.device_id, "GPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "GPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:gpu:%d" % (k, mevent.device_id), + pid) + elif mevent.place == profiler_pb2.MemEvent.CPUPlace: + if (k, mevent.device_id, "CPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "CPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cpu:%d" % (k, mevent.device_id), + pid) + elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace: + if (k, mevent.device_id, "CUDAPinnedPlace" + ) not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, + "CUDAPinnedPlace")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cudapinnedplace:%d" % + (k, mevent.device_id), pid) + elif mevent.place == profiler_pb2.MemEvent.NPUPlace: + if (k, mevent.device_id, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, mevent.device_id, "NPU")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:npu:%d" % (k, mevent.device_id), + pid) + if (k, 0, "CPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "CPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" % + (k, 0), pid) + if (k, 0, "GPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "GPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" % + (k, 0), pid) + if (k, 0, "CUDAPinnedPlace") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid + self._chrome_trace.emit_pid( + "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid) + if (k, 0, "NPU") not in self._mem_devices: + pid = self._allocate_pid() + self._mem_devices[(k, 0, "NPU")] = pid + self._chrome_trace.emit_pid("memory usage on %s:npu:%d" % + (k, 0), pid) + + def _allocate_events(self): + for k, profile_pb in six.iteritems(self._profile_dict): + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + type = "CPU" + elif event.type == profiler_pb2.Event.GPUKernel: + type = "GPUKernel" + pid = self._devices[(k, event.device_id, type)] + args = {'name': event.name} + if event.memcopy.bytes > 0: + args['mem_bytes'] = event.memcopy.bytes + if hasattr(event, "detail_info") and event.detail_info: + args['detail_info'] = event.detail_info + # TODO(panyx0718): Chrome tracing only handles ms. However, some + # ops takes micro-seconds. Hence, we keep the ns here. + self._chrome_trace.emit_region( + event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, + event.sub_device_id, 'Op', event.name, args) + + def _allocate_memory_event(self): + if not hasattr(profiler_pb2, "MemEvent"): + return + place_to_str = { + profiler_pb2.MemEvent.CPUPlace: "CPU", + profiler_pb2.MemEvent.CUDAPlace: "GPU", + profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace", + profiler_pb2.MemEvent.NPUPlace: "NPU" + } + for k, profile_pb in six.iteritems(self._profile_dict): + mem_list = [] + end_profiler = 0 + for mevent in profile_pb.mem_events: + crt_info = dict() + crt_info['time'] = mevent.start_ns + crt_info['size'] = mevent.bytes + if mevent.place in place_to_str: + place = place_to_str[mevent.place] + else: + place = "UnDefine" + crt_info['place'] = place + pid = self._mem_devices[(k, mevent.device_id, place)] + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + mem_list.append(crt_info) + crt_info = dict() + crt_info['place'] = place + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + crt_info['time'] = mevent.end_ns + crt_info['size'] = -mevent.bytes + mem_list.append(crt_info) + end_profiler = max(end_profiler, crt_info['time']) + mem_list.sort(key=lambda tmp: (tmp.get('time', 0))) + i = 0 + total_size = 0 + while i < len(mem_list): + total_size += mem_list[i]['size'] + while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[ + i + 1]['time']: + total_size += mem_list[i + 1]['size'] + i += 1 + + self._chrome_trace.emit_counter( + "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'], + 0, total_size) + i += 1 + + def generate_chrome_trace(self): + self._allocate_pids() + self._allocate_events() + self._allocate_memory_event() + return self._chrome_trace.format_to_string() + + +profile_path = '/tmp/profile' +if args.profile_path: + profile_path = args.profile_path +timeline_path = '/tmp/timeline' +if args.timeline_path: + timeline_path = args.timeline_path + +profile_paths = profile_path.split(',') +profile_dict = dict() +if len(profile_paths) == 1: + with open(profile_path, 'rb') as f: + profile_s = f.read() + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(profile_s) + profile_dict['trainer'] = profile_pb +else: + for profile_path in profile_paths: + k, v = profile_path.split('=') + with open(v, 'rb') as f: + profile_s = f.read() + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(profile_s) + profile_dict[k] = profile_pb + +tl = Timeline(profile_dict) +with open(timeline_path, 'w') as f: + f.write(tl.generate_chrome_trace()) From b5f99a925fed45dbeb857924b6ad111b86f084cf Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 21 Jun 2021 11:17:01 +0000 Subject: [PATCH 33/47] add numerical testing for PWGGenerator --- tests/test_pwg.py | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/tests/test_pwg.py b/tests/test_pwg.py index 7f2205d..41027b5 100644 --- a/tests/test_pwg.py +++ b/tests/test_pwg.py @@ -66,9 +66,14 @@ def test_convin_upsample_net(): out2.sum().backward() print(f"torch conv_in_upsample_net backward takes {t.elapse}s.") + print("forward check") print(out.numpy()[0]) print(out2.data.cpu().numpy()[0]) + print("backward check") + print(net.conv_in.weight.numpy()[0]) + print(net2.conv_in.weight.data.cpu().numpy()[0]) + def test_residual_block(): net = ResidualBlock(dilation=9) @@ -137,14 +142,19 @@ def test_pwg_generator(): torch.cuda.synchronize() print(f"torch generator backward takes {t.elapse}s.") + print("test forward:") print(out.numpy()[0]) print(out2.data.cpu().numpy()[0]) + + print("test backward:") + print(net.first_conv.weight.numpy()[0]) + print(net2.first_conv.weight.data.cpu().numpy()[0]) # print(out.shape) def test_pwg_discriminator(): net = PWGDiscriminator() - net2 = pwgan.ParallelWaveGANDiscriminator() + net2 = pwgan.ParallelWaveGANDiscriminator().to(device) summary(net) summary(net2) for k, v in net2.named_parameters(): @@ -154,11 +164,39 @@ def test_pwg_discriminator(): else: p.set_value(v.data.cpu().numpy()) x = paddle.randn([4, 1, 180 * 256]) - y = net(x) - y2 = net2(torch.as_tensor(x.numpy())) + + synchronize() + with timer() as t: + y = net(x) + synchronize() + print(f"forward takes {t.elapse}s.") + + synchronize() + with timer() as t: + y.sum().backward() + synchronize() + print(f"backward takes {t.elapse}s.") + + x_torch = torch.as_tensor(x.numpy()).to(device) + torch.cuda.synchronize() + with timer() as t: + y2 = net2(x_torch) + torch.cuda.synchronize() + print(f"forward takes {t.elapse}s.") + + torch.cuda.synchronize() + with timer() as t: + y2.sum().backward() + torch.cuda.synchronize() + print(f"backward takes {t.elapse}s.") + + print("test forward:") print(y.numpy()[0]) print(y2.data.cpu().numpy()[0]) - print(y.shape) + + print("test backward:") + print(net.conv_layers[0].weight.numpy()[0]) + print(net2.conv_layers[0].weight.data.cpu().numpy()[0]) def test_residual_pwg_discriminator(): From ea5cb8e71fd12456fc8d7fad5529df36add2aacd Mon Sep 17 00:00:00 2001 From: liangyunming Date: Wed, 23 Jun 2021 15:18:37 +0800 Subject: [PATCH 34/47] Dynamic to static --- examples/tacotron2/README.md | 4 +-- parakeet/models/waveflow.py | 65 +++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/examples/tacotron2/README.md b/examples/tacotron2/README.md index 81b91fa..a35ecc8 100644 --- a/examples/tacotron2/README.md +++ b/examples/tacotron2/README.md @@ -58,10 +58,10 @@ For more help on arguments ## Synthesis -After training the Tacotron2, spectrogram can be synthesized by running ``synthesis.py``. +After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``. ```bash -python synthesis.py \ +python synthesize.py \ --config=${CONFIGPATH} \ --checkpoint_path=${CHECKPOINTPATH} \ --input=${TEXTPATH} \ diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py index 154b95c..6744f28 100644 --- a/parakeet/models/waveflow.py +++ b/parakeet/models/waveflow.py @@ -44,7 +44,8 @@ def fold(x, n_group): Tensor : [shape=(\*, time_steps // n_group, group)] Folded tensor. """ - *spatial_shape, time_steps = x.shape + spatial_shape = list(x.shape[:-1]) + time_steps = paddle.shape(x)[-1] new_shape = spatial_shape + [time_steps // n_group, n_group] return paddle.reshape(x, new_shape) @@ -232,7 +233,7 @@ class ResidualBlock(nn.Layer): """ if self.training: raise ValueError("Only use start sequence at evaluation mode.") - self._conv_buffer = None + self._conv_buffer = paddle.zeros([1]) # NOTE: call self.conv's weight norm hook expliccitly since # its weight will be visited directly in `add_input` without @@ -263,10 +264,9 @@ class ResidualBlock(nn.Layer): A row of the skip output. """ x_row_in = x_row - if self._conv_buffer is None: + if len(paddle.shape(self._conv_buffer)) == 1: self._init_buffer(x_row) self._update_buffer(x_row) - rw = self.rw x_row = F.conv2d( self._conv_buffer, @@ -275,7 +275,6 @@ class ResidualBlock(nn.Layer): padding=[0, 0, rw // 2, (rw - 1) // 2], dilation=self.dilations) x_row += self.condition_proj(condition_row) - content, gate = paddle.chunk(x_row, 2, axis=1) x_row = paddle.tanh(content) * F.sigmoid(gate) @@ -329,7 +328,7 @@ class ResidualNet(nn.LayerList): if len(dilations_h) != n_layer: raise ValueError( "number of dilations_h should equals num of layers") - super().__init__() + super(ResidualNet, self).__init__() for i in range(n_layer): dilation = (dilations_h[i], 2**i) layer = ResidualBlock(residual_channels, condition_channels, @@ -539,27 +538,21 @@ class Flow(nn.Layer): transformation from x to z. """ z_0 = z[:, :, :1, :] - x = [] - logs_list = [] - b_list = [] - x.append(z_0) + x = paddle.zeros_like(z) + x[:, :, :1, :] = z_0 self._start_sequence() - for i in range(1, self.n_group): - x_row = x[-1] # actuallt i-1:i + + num_step = paddle.ones([1], dtype='int32') * (self.n_group) + for i in range(1, num_step): + x_row = x[:, :, i - 1:i, :] z_row = z[:, :, i:i + 1, :] condition_row = condition[:, :, i:i + 1, :] - x_next_row, (logs, b) = self._inverse_row(z_row, x_row, - condition_row) - x.append(x_next_row) - logs_list.append(logs) - b_list.append(b) - - x = paddle.concat(x, 2) - logs = paddle.concat(logs_list, 2) - b = paddle.concat(b_list, 2) - return x, (logs, b) + condition_row) + x[:, :, i:i+1, :] = x_next_row + + return x class WaveFlow(nn.LayerList): @@ -611,16 +604,18 @@ class WaveFlow(nn.LayerList): perms = [] for i in range(n_flows): if i < n_flows // 2: - perms.append(indices[::-1]) + perm = indices[::-1] else: perm = list(reversed(indices[:half])) + list( reversed(indices[half:])) - perms.append(perm) + perm = paddle.to_tensor(perm) + self.register_buffer(perm.name, perm) + perms.append(perm) return perms def _trim(self, x, condition): assert condition.shape[-1] >= x.shape[-1] - pruned_len = int(x.shape[-1] // self.n_group * self.n_group) + pruned_len = int(paddle.shape(x)[-1] // self.n_group * self.n_group) if x.shape[-1] > pruned_len: x = x[:, :pruned_len] @@ -707,7 +702,7 @@ class WaveFlow(nn.LayerList): for i in reversed(range(self.n_flows)): z = geo.shuffle_dim(z, 2, perm=self.perms[i]) condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) - z, (logs, b) = self[i].inverse(z, condition) + z = self[i].inverse(z, condition) x = paddle.squeeze(z, 1) # (B, H, W) batch_size = x.shape[0] @@ -893,3 +888,21 @@ class WaveFlowLoss(nn.Layer): ) - log_det_jacobian loss = loss / np.prod(z.shape) return loss + self.const + + +class ConditionalWaveFlow2Infer(ConditionalWaveFlow): + def forward(self, mel): + """Generate raw audio given mel spectrogram. + + Parameters + ---------- + mel : np.ndarray [shape=(C_mel, T_mel)] + Mel spectrogram of an utterance(in log-magnitude). + + Returns + ------- + np.ndarray [shape=(T,)] + The synthesized audio. + """ + audio = self.predict(mel) + return audio From 542bbf6a81c486dde5722090221d59fe256a90f5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 24 Jun 2021 14:53:38 +0000 Subject: [PATCH 35/47] add profiling tool and a config for parallel training --- .../parallelwave_gan/baker/conf/parallel.yaml | 127 ++++++++++++++++++ parakeet/utils/profile.py | 11 ++ 2 files changed, 138 insertions(+) create mode 100644 examples/parallelwave_gan/baker/conf/parallel.yaml diff --git a/examples/parallelwave_gan/baker/conf/parallel.yaml b/examples/parallelwave_gan/baker/conf/parallel.yaml new file mode 100644 index 0000000..f23b243 --- /dev/null +++ b/examples/parallelwave_gan/baker/conf/parallel.yaml @@ -0,0 +1,127 @@ +# This is the hyperparameter configuration file for Parallel WaveGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +sr: 24000 # Sampling rate. +n_fft: 2048 # FFT size. +hop_length: 300 # Hop size. +win_length: 1200 # Window length. + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. +fmax: 7600 # Maximum frequency in mel basis calculation. +# global_gain_scale: 1.0 # Will be multiplied to all of waveform. +trim_silence: false # Whether to trim the start and end of silence. +top_db: 60 # Need to tune carefully if the recording is not good. +trim_frame_length: 2048 # Frame size in trimming. +trim_hop_length: 512 # Hop size in trimming. +# format: "npy" # Feature file format. "npy" or "hdf5" is supported. + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Kernel size of dilated convolution. + layers: 30 # Number of residual block layers. + stacks: 3 # Number of stacks i.e., dilation cycles. + residual_channels: 64 # Number of channels in residual conv. + gate_channels: 128 # Number of channels in gated conv. + skip_channels: 64 # Number of channels in skip conv. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. + bias: true # use bias in residual blocks + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + use_causal_conv: false # use causal conv in residual blocks and upsample layers + # upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. + interpolate_mode: "nearest" # upsample net interpolate mode + freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis + nonlinear_activation: null + nonlinear_activation_params: {} + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_size: 3 # Number of output channels. + layers: 10 # Number of conv layers. + conv_channels: 64 # Number of chnn layers. + bias: true # Whether to use bias parameter in conv. + use_weight_norm: true # Whether to use weight norm. + # If set to true, it will be applied to all of the conv layers. + nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. + nonlinear_activation_params: # Nonlinear function parameters + negative_slope: 0.2 # Alpha in LeakyReLU. + +########################################################### +# STFT LOSS SETTING # +########################################################### +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_adv: 4.0 # Loss balancing coefficient. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 6 # Batch size. +batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. +pin_memory: true # Whether to pin memory in Pytorch DataLoader. +num_workers: 4 # Number of workers in Pytorch DataLoader. +remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. +allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 100000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 100000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 50000 # Number of steps to start to train discriminator. +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. +log_interval_steps: 100 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. diff --git a/parakeet/utils/profile.py b/parakeet/utils/profile.py index 29007a9..cfffb4b 100644 --- a/parakeet/utils/profile.py +++ b/parakeet/utils/profile.py @@ -13,7 +13,9 @@ # limitations under the License. import paddle +from paddle.framework import core from paddle.framework import CUDAPlace +from contextlib import contextmanager def synchronize(): @@ -21,3 +23,12 @@ def synchronize(): place = paddle.fluid.framework._current_expected_place() if isinstance(place, CUDAPlace): paddle.fluid.core._cuda_synchronize(place) + + +@contextmanager +def nvtx_span(name): + try: + core.nvprof_nvtx_push(name) + yield + finally: + core.nvprof_nvtx_pop() From 7e049a774463672b214f10b288bffe157ee2062d Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 25 Jun 2021 01:01:30 +0800 Subject: [PATCH 36/47] add expanuser for preprocess, updater backward testing for pwg --- examples/parallelwave_gan/baker/preprocess.py | 4 +- tests/test_pwg.py | 54 +++++++++++++------ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index 09f2004..3ae461b 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -232,8 +232,8 @@ def main(): print(vars(args)) print(C) - root_dir = Path(args.rootdir) - dumpdir = Path(args.dumpdir) + root_dir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() dumpdir.mkdir(parents=True, exist_ok=True) wav_files = sorted(list((root_dir / "Wave").rglob("*.wav"))) diff --git a/tests/test_pwg.py b/tests/test_pwg.py index 41027b5..0978714 100644 --- a/tests/test_pwg.py +++ b/tests/test_pwg.py @@ -71,8 +71,8 @@ def test_convin_upsample_net(): print(out2.data.cpu().numpy()[0]) print("backward check") - print(net.conv_in.weight.numpy()[0]) - print(net2.conv_in.weight.data.cpu().numpy()[0]) + print(net.conv_in.weight.grad.numpy()[0]) + print(net2.conv_in.weight.grad.data.cpu().numpy()[0]) def test_residual_block(): @@ -87,23 +87,40 @@ def test_residual_block(): c = paddle.randn([4, 80, 180]) res, skip = net(x, c) res2, skip2 = net2(torch.as_tensor(x.numpy()), torch.as_tensor(c.numpy())) + + print("forward:") print(res.numpy()[0]) print(res2.data.cpu().numpy()[0]) print(skip.numpy()[0]) print(skip2.data.cpu().numpy()[0]) + (res.sum() + skip.sum()).backward() + (res2.sum() + skip2.sum()).backward() + + print("backward:") + print(net.conv.weight.grad.numpy().squeeze()[0]) + print(net2.conv.weight.grad.data.cpu().numpy().squeeze()[0]) + def test_pwg_generator(): net = PWGGenerator( + layers=9, + stacks=3, + upsample_scales=[4, 4, 4, 4], nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}) - net2 = pwgan.ParallelWaveGANGenerator(upsample_params={ - "upsample_scales": [4, 4, 4, 4], - "nonlinear_activation": "LeakyReLU", - "nonlinear_activation_params": { - "negative_slope": 0.2 - } - }).to(device) + nonlinear_activation_params={"negative_slope": 0.5}, + use_weight_norm=True) + net2 = pwgan.ParallelWaveGANGenerator( + layers=9, + stacks=3, + upsample_params={ + "upsample_scales": [4, 4, 4, 4], + "nonlinear_activation": "LeakyReLU", + "nonlinear_activation_params": { + "negative_slope": 0.5 + } + }, + use_weight_norm=True).to(device) summary(net) summary(net2) for k, v in net2.named_parameters(): @@ -112,8 +129,8 @@ def test_pwg_generator(): p.set_value(v.data.cpu().numpy().reshape([-1])) else: p.set_value(v.data.cpu().numpy()) - x = paddle.randn([4, 1, 180 * 256]) - c = paddle.randn([4, 80, 180 + 4]) + x = paddle.randn([4, 1, 80 * 256]) + c = paddle.randn([4, 80, 80 + 4]) synchronize() with timer(unit='s') as t: @@ -147,8 +164,13 @@ def test_pwg_generator(): print(out2.data.cpu().numpy()[0]) print("test backward:") - print(net.first_conv.weight.numpy()[0]) - print(net2.first_conv.weight.data.cpu().numpy()[0]) + print("wv") + print(net.first_conv.weight_v.grad.numpy().squeeze()) + print(net2.first_conv.weight_v.grad.data.cpu().numpy().squeeze()) + + print("wg") + print(net.first_conv.weight_g.grad.numpy().squeeze()) + print(net2.first_conv.weight_g.grad.data.cpu().numpy().squeeze()) # print(out.shape) @@ -195,8 +217,8 @@ def test_pwg_discriminator(): print(y2.data.cpu().numpy()[0]) print("test backward:") - print(net.conv_layers[0].weight.numpy()[0]) - print(net2.conv_layers[0].weight.data.cpu().numpy()[0]) + print(net.conv_layers[0].weight_v.grad.numpy().squeeze()) + print(net2.conv_layers[0].weight_v.grad.data.cpu().numpy().squeeze()) def test_residual_pwg_discriminator(): From 3e8a156348b93e388cccc5fa3215c9aa87cd0dfd Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Sat, 26 Jun 2021 19:31:46 +0800 Subject: [PATCH 37/47] add synthesis script for pwg --- examples/parallelwave_gan/baker/synthesize.py | 81 +++++++++++++++++++ examples/parallelwave_gan/baker/train.py | 4 +- parakeet/models/parallel_wavegan.py | 2 +- 3 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 examples/parallelwave_gan/baker/synthesize.py diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py new file mode 100644 index 0000000..4c4e754 --- /dev/null +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import logging +import argparse +from pathlib import Path + +import yaml +import jsonlines +import paddle +import numpy as np +import soundfile as sf +from paddle import distributed as dist + +from parakeet.datasets.data_table import DataTable +from parakeet.models.parallel_wavegan import PWGGenerator + +from config import get_cfg_default + +parser = argparse.ArgumentParser( + description="synthesize with parallel wavegan.") +parser.add_argument( + "--config", type=str, help="config file to overwrite default config") +parser.add_argument("--params", type=str, help="generator parameter file") +parser.add_argument("--test-metadata", type=str, help="dev data") +parser.add_argument("--output-dir", type=str, help="output dir") +parser.add_argument("--verbose", type=int, default=1, help="verbose") + +args = parser.parse_args() +config = get_cfg_default() +if args.config: + config.merge_from_file(args.config) + +print("========Args========") +print(yaml.safe_dump(vars(args))) +print("========Config========") +print(config) +print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" +) + +generator = PWGGenerator(**config["generator_params"]) +state_dict = paddle.load(args.params) +generator.set_state_dict(state_dict) + +generator.remove_weight_norm() +generator.eval() +with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + +test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 'utt_id': None, + 'feats': np.load, + }) +output_dir = Path(args.output_dir) +output_dir.mkdir(parents=True, exist_ok=True) + +for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + wav = generator.inference(c=mel) + wav = wav.numpy() + print(f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}") + sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 2854b66..7232f0b 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -134,7 +134,8 @@ def train_sp(args, config): parameters=generator.parameters(), **config["generator_optimizer_params"]) lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"]) - gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"]) + gradient_clip_d = nn.ClipGradByGlobalNorm(config[ + "discriminator_grad_norm"]) optimizer_d = Adam( learning_rate=lr_schedule_d, grad_clip=gradient_clip_d, @@ -180,7 +181,6 @@ def train_sp(args, config): }, dataloader=dev_dataloader, lambda_adv=config.lambda_adv, ) - trainer = Trainer( updater, stop_trigger=(config.train_max_steps, "iteration"), diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py index 6f2b44b..ea183ef 100644 --- a/parakeet/models/parallel_wavegan.py +++ b/parakeet/models/parallel_wavegan.py @@ -519,7 +519,7 @@ class PWGGenerator(nn.Layer): if c is not None: c = paddle.transpose(c, [1, 0]).unsqueeze(0) # pseudo batch - c = nn.Pad1D(self.aux_context_window, mode='edge')(c) + c = nn.Pad1D(self.aux_context_window, mode='replicate')(c) out = self.forward(x, c).squeeze(0).transpose([1, 0]) return out From f9105db72743e7b90f0b77055d3d5335da0014e8 Mon Sep 17 00:00:00 2001 From: iclementine Date: Sat, 26 Jun 2021 22:16:56 +0800 Subject: [PATCH 38/47] make a better snapshot extension --- parakeet/training/extensions/snapshot.py | 54 ++++++++++++++++++++--- parakeet/training/trainer.py | 8 +++- parakeet/training/updater.py | 2 +- tests/test_snapshot.py | 55 ++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 9 deletions(-) create mode 100644 tests/test_snapshot.py diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py index e31403b..9cafef1 100644 --- a/parakeet/training/extensions/snapshot.py +++ b/parakeet/training/extensions/snapshot.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union +from typing import Union, List, Dict, Any from pathlib import Path +import jsonlines +import os +from datetime import datetime +import logging from parakeet.utils.mp_tools import rank_zero_only from parakeet.training.trainer import Trainer @@ -24,7 +28,7 @@ class Snapshot(object): the trainer. It is done by calling the updater's `save` method. An Updater save its state_dict by default, which contains the - updater state, (i.e. epoch and iteration) and all the model + updater state, (i.e. epoch and iteration) and all the model parameters and optimizer states. If the updater inside the trainer subclasses StandardUpdater, everything is good to go. @@ -34,11 +38,47 @@ class Snapshot(object): The directory to save checkpoints into. """ - def __init__(self, checkpoint_dir: Union[str, Path]): - self.checkpoint_dir = Path(checkpoint_dir) + def __init__(self, max_size: int=5): + self.records: List[Dict[str, Any]] = [] + self.max_size = max_size + self._save_all = (max_size == -1) + self.save_fn =... + self.del_fn =... + self.checkpoint_dir =... + + def initialize(self, trainer): + """setting up this extention.""" + self.save_fn = trainer.updater.save + self.del_fn = os.remove + self.checkpoint_dir = trainer.out / "checkpoints" + + def full(self): + return (not self._save_all) and len(self.records) >= self.max_size @rank_zero_only - def __call__(self, trainer: Trainer): + def save_checkpoint_and_update(self, trainer): iteration = trainer.updater.state.iteration - path = self.checkpoint_dir / f"step_{iteration}.pdz" - trainer.updater.save(str(path)) + path = self.checkpoint_dir / f"snapshot_iter_{iteration}.pdz" + + # remove the earist + if self.full(): + eariest_record = self.records[0] + self.del_fn(eariest_record["path"]) + self.records.pop(0) + + # add the new one + self.save_fn(path) + record = { + "time": str(datetime.now()), + 'path': str(path), + 'iteration': iteration + } + self.records.append(record) + + # update the record + with jsonlines.open(self.checkpoint_dir / "records.jsonl", 'w') as f: + for record in self.records: + f.write(record) + + def __call__(self, trainer): + self.save_checkpoint_and_update(trainer) diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index 99e5114..38ccefb 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -59,7 +59,13 @@ class Trainer(object): self.extensions.keys(), key=lambda name: self.extensions[name].priority, reverse=True) - extensions = [(name, self.extensions[name]) for name in extension_order] + extensions = [(name, self.extensions[name]) + for name in extension_order] + + print("initializing") + for name, entry in extensions: + if hasattr(entry.extension, "initialize"): + entry.extension.initialize(self) update = self.updater.update stop_trigger = self.stop_trigger diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index cb2213c..fb3bb41 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -198,7 +198,7 @@ class StandardUpdater(UpdaterBase): return state_dict def set_state_dict(self, state_dict): - """Set state dict for a Updater. Parameters of models, states for + """Set state dict for a Updater. Parameters of models, states for optimizers and UpdaterState are restored.""" for name, layer in self.models.items(): layer.set_state_dict(state_dict[f"{name}_params"]) diff --git a/tests/test_snapshot.py b/tests/test_snapshot.py new file mode 100644 index 0000000..71e422c --- /dev/null +++ b/tests/test_snapshot.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +import shutil + +import numpy as np +import paddle +from paddle import nn +from paddle.optimizer import Adam +from itertools import count + +from parakeet.training.updater import StandardUpdater +from parakeet.training.trainer import Trainer +from parakeet.training.extensions.snapshot import Snapshot + + +def test_snapshot(): + model = nn.Linear(3, 4) + optimizer = Adam(parameters=model.parameters()) + + # use a simplest iterable object as dataloader + dataloader = count() + + # hack the training proecss: training does nothing except increse iteration + updater = StandardUpdater(model, optimizer, dataloader=dataloader) + updater.update_core = lambda x: None + + trainer = Trainer( + updater, stop_trigger=(1000, 'iteration'), out='temp_test_snapshot') + shutil.rmtree(trainer.out, ignore_errors=True) + + snap = Snapshot(max_size=5) + trigger = (10, 'iteration') + trainer.extend(snap, name='snapshot', trigger=trigger, priority=0) + + trainer.run() + + checkpoint_dir = trainer.out / "checkpoints" + snapshots = sorted(list(checkpoint_dir.glob("snapshot_iter_*.pdz"))) + for snap in snapshots: + print(snap) + assert len(snapshots) == 5 + shutil.rmtree(trainer.out) From 61c13dd69b687bd57161f0c5bc7f94bf7b7af149 Mon Sep 17 00:00:00 2001 From: iclementine Date: Sun, 27 Jun 2021 08:32:37 +0800 Subject: [PATCH 39/47] add better traininig utility code --- parakeet/__init__.py | 3 + parakeet/training/checkpoint.py | 163 ------------------ parakeet/training/extension.py | 80 +++++++++ parakeet/training/extensions/evaluator.py | 28 ++- parakeet/training/extensions/snapshot.py | 72 +++++--- parakeet/training/extensions/visualizer.py | 23 ++- parakeet/training/seeding.py | 3 +- parakeet/training/trainer.py | 72 ++++++-- parakeet/training/trigger.py | 16 +- .../training/triggers/interval_trigger.py | 35 ++++ parakeet/training/triggers/time_trigger.py | 35 ++++ parakeet/training/updater.py | 122 ------------- .../training/updaters/standard_updater.py | 152 ++++++++++++++++ tests/test_checkpoint.py | 56 ------ 14 files changed, 452 insertions(+), 408 deletions(-) delete mode 100644 parakeet/training/checkpoint.py create mode 100644 parakeet/training/extension.py create mode 100644 parakeet/training/triggers/interval_trigger.py create mode 100644 parakeet/training/triggers/time_trigger.py create mode 100644 parakeet/training/updaters/standard_updater.py delete mode 100644 tests/test_checkpoint.py diff --git a/parakeet/__init__.py b/parakeet/__init__.py index d4940a3..cce4fff 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -14,4 +14,7 @@ __version__ = "0.2.0-beta.0" +import logging from parakeet import audio, data, datasets, frontend, models, modules, training, utils + +logging.getLogger('parakeet').addHandler(logging.NullHandler()) diff --git a/parakeet/training/checkpoint.py b/parakeet/training/checkpoint.py deleted file mode 100644 index bbbbdc0..0000000 --- a/parakeet/training/checkpoint.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Mapping, List, Union -import os -from pathlib import Path - - -class KBest(object): - """ - A utility class to help save the hard drive by only keeping K best - checkpoints. - - To be as modularized as possible, this class does not assume anything like - a Trainer class or anything like a checkpoint directory, it does not know - about the model or the optimizer, etc. - - It is basically a dynamically mantained K-bset Mapping. When a new item is - added to the map, save_fn is called. And when an item is removed from the - map, del_fn is called. `save_fn` and `del_fn` takes a Path object as input - and returns nothing. - - Though it is designed to control checkpointing behaviors, it can be used - to do something else if you pass some save_fn and del_fn. - - Example - -------- - - >>> from pathlib import Path - >>> import shutil - >>> import paddle - >>> from paddle import nn - - >>> model = nn.Linear(2, 3) - >>> def save_model(path): - ... paddle.save(model.state_dict(), path) - - >>> kbest_manager = KBest(max_size=5, save_fn=save_model) - >>> checkpoint_dir = Path("checkpoints") - >>> shutil.rmtree(checkpoint_dir) - >>> checkpoint_dir.mkdir(parents=True) - >>> a = np.random.rand(20) - >>> for i, score in enumerate(a): - ... path = checkpoint_dir / f"step_{i}" - ... kbest_manager.add_checkpoint(score, path) - >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5 - """ - - def __init__(self, - max_size: int=5, - save_fn: Callable[[Union[Path, str]], None]=None, - del_fn: Callable[[Union[Path, str]], None]=os.remove): - self.best_records: Mapping[Path, float] = {} - self.save_fn = save_fn - self.del_fn = del_fn - self.max_size = max_size - self._save_all = (max_size == -1) - - def should_save(self, metric: float) -> bool: - if not self.full(): - return True - - # already full - worst_record_path = max(self.best_records, key=self.best_records.get) - worst_metric = self.best_records[worst_record_path] - return metric < worst_metric - - def full(self): - return (not self._save_all) and len(self.best_records) == self.max_size - - def add_checkpoint(self, metric, path): - if self.should_save(metric): - self.save_checkpoint_and_update(metric, path) - - def save_checkpoint_and_update(self, metric, path): - # remove the worst - if self.full(): - worst_record_path = max(self.best_records, - key=self.best_records.get) - self.best_records.pop(worst_record_path) - self.del_fn(worst_record_path) - - # add the new one - self.save_fn(path) - self.best_records[path] = metric - - -class KLatest(object): - """ - A utility class to help save the hard drive by only keeping K latest - checkpoints. - - To be as modularized as possible, this class does not assume anything like - a Trainer class or anything like a checkpoint directory, it does not know - about the model or the optimizer, etc. - - It is basically a dynamically mantained Queue. When a new item is - added to the queue, save_fn is called. And when an item is removed from the - queue, del_fn is called. `save_fn` and `del_fn` takes a Path object as input - and returns nothing. - - Though it is designed to control checkpointing behaviors, it can be used - to do something else if you pass some save_fn and del_fn. - - Example - -------- - - >>> from pathlib import Path - >>> import shutil - >>> import paddle - >>> from paddle import nn - - >>> model = nn.Linear(2, 3) - >>> def save_model(path): - ... paddle.save(model.state_dict(), path) - - >>> klatest_manager = KLatest(max_size=5, save_fn=save_model) - >>> checkpoint_dir = Path("checkpoints") - >>> shutil.rmtree(checkpoint_dir) - >>> checkpoint_dir.mkdir(parents=True) - >>> for i in range(20): - ... path = checkpoint_dir / f"step_{i}" - ... klatest_manager.add_checkpoint(path) - >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5 - """ - - def __init__(self, - max_size: int=5, - save_fn: Callable[[Path], None]=None, - del_fn: Callable[[Path], None]=lambda f: f.unlink()): - self.latest_records: List[Path] = [] - self.save_fn = save_fn - self.del_fn = del_fn - self.max_size = max_size - self._save_all = (max_size == -1) - - def full(self): - return ( - not self._save_all) and len(self.latest_records) == self.max_size - - def add_checkpoint(self, path): - self.save_checkpoint_and_update(path) - - def save_checkpoint_and_update(self, path): - # remove the earist - if self.full(): - eariest_record_path = self.latest_records.pop(0) - self.del_fn(eariest_record_path) - - # add the new one - self.save_fn(path) - self.latest_records.append(path) diff --git a/parakeet/training/extension.py b/parakeet/training/extension.py new file mode 100644 index 0000000..57c4f29 --- /dev/null +++ b/parakeet/training/extension.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable + +PRIORITY_WRITER = 300 +PRIORITY_EDITOR = 200 +PRIORITY_READER = 100 + + +class Extension(object): + """Extension to customize the behavior of Trainer.""" + trigger = (1, 'iteration') + priority = PRIORITY_READER + name = None + + @property + def default_name(self): + """Default name of the extension, class name by default.""" + return type(self).__name__ + + def __call__(self, trainer): + """Main action of the extention. After each update, it is executed + when the trigger fires.""" + raise NotImplementedError( + 'Extension implementation must override __call__.') + + def initialize(self, trainer): + """Action that is executed once to get the corect trainer state. + It is called before training normally, but if the trainer restores + states with an Snapshot extension, this method should also be called.g + """ + pass + + def on_error(self, trainer, exc, tb): + """Handles the error raised during training before finalization. + """ + pass + + def finalize(self, trainer): + """Action that is executed when training is done. + For example, visualizers would need to be closed. + """ + pass + + +def make_extension(trigger: Callable=None, + default_name: str=None, + priority: int=None, + finalizer: Callable=None, + initializer: Callable=None, + on_error: Callable=None): + """Make an Extension-like object by injecting required attributes to it. + """ + if trigger is None: + trigger = Extension.trigger + if priority is None: + priority = Extension.priority + + def decorator(ext): + ext.trigger = trigger + ext.default_name = default_name or ext.__name__ + ext.priority = priority + ext.finalize = finalizer + ext.on_error = on_error + ext.initialize = initializer + return ext + + return decorator diff --git a/parakeet/training/extensions/evaluator.py b/parakeet/training/extensions/evaluator.py index eb3f877..6ebaae6 100644 --- a/parakeet/training/extensions/evaluator.py +++ b/parakeet/training/extensions/evaluator.py @@ -22,9 +22,17 @@ from paddle.nn import Layer from paddle.io import DataLoader from parakeet.training.reporter import scope, report, DictSummary +from parakeet.training import extension -class StandardEvaluator(object): +class StandardEvaluator(extension.Extension): + + trigger = (1, 'epoch') + default_name = 'validation' + priority = extension.PRIORITY_WRITER + + name = None + def __init__(self, model: Layer, dataloader: DataLoader): # it is designed to hold multiple models models = {"main": model} @@ -43,21 +51,23 @@ class StandardEvaluator(object): for layer in self.models.values(): layer.eval() + # to average evaluation metrics summary = DictSummary() for batch in self.dataloader: observation = {} with scope(observation): + # main evaluation computation here. with paddle.no_grad(): - self.evaluate_core( - batch) # main evaluation computation here. + self.evaluate_core(batch) summary.add(observation) summary = summary.compute_mean() return summary def __call__(self, trainer=None): - self.observation = {} - with scope(self.observation): - summary = self.evaluate() - for k, v in summary.items(): - report(k, v) - print(self.observation) + # evaluate and report the averaged metric to current observation + # if it is used to extend a trainer, the metrics is reported to + # to observation of the trainer + # or otherwise, you can use your own observation + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py index 9cafef1..853d62e 100644 --- a/parakeet/training/extensions/snapshot.py +++ b/parakeet/training/extensions/snapshot.py @@ -12,18 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union, List, Dict, Any -from pathlib import Path -import jsonlines import os +from pathlib import Path from datetime import datetime -import logging +from typing import List, Dict, Any + +import jsonlines from parakeet.utils.mp_tools import rank_zero_only from parakeet.training.trainer import Trainer +from parakeet.training import extension -class Snapshot(object): +def load_records(records_fp): + """Load record files (json lines.)""" + with jsonlines.open(records_fp, 'r') as reader: + records = list(reader) + return records + + +class Snapshot(extension.Extension): """An extension to make snapshot of the updater object inside the trainer. It is done by calling the updater's `save` method. @@ -38,34 +46,46 @@ class Snapshot(object): The directory to save checkpoints into. """ - def __init__(self, max_size: int=5): + trigger = (1, 'epoch') + priority = -100 + + def __init__(self, max_size: int=5, snapshot_on_error: bool=False): self.records: List[Dict[str, Any]] = [] self.max_size = max_size + self._snapshot_on_error = snapshot_on_error self._save_all = (max_size == -1) self.save_fn =... - self.del_fn =... + self.del_fn = os.remove self.checkpoint_dir =... - def initialize(self, trainer): - """setting up this extention.""" + def initialize(self, trainer: Trainer): + """Setting up this extention.""" self.save_fn = trainer.updater.save - self.del_fn = os.remove self.checkpoint_dir = trainer.out / "checkpoints" + # load existing records + record_path: Path = self.checkpoint_dir / "records.yaml" + if record_path.exists(): + self.records = load_records(record_path) + + def on_error(self, trainer, exc, tb): + if self._snapshot_on_error: + self.save_checkpoint_and_update(trainer) + + def __call__(self, trainer: Trainer): + self.save_checkpoint_and_update(trainer) + def full(self): - return (not self._save_all) and len(self.records) >= self.max_size + """Whether the number of snapshots it keeps track of is greater + than the max_size.""" + return (not self._save_all) and len(self.records) > self.max_size @rank_zero_only - def save_checkpoint_and_update(self, trainer): + def save_checkpoint_and_update(self, trainer: Trainer): + """Saving new snapshot and remove the oldest snapshot if needed.""" iteration = trainer.updater.state.iteration path = self.checkpoint_dir / f"snapshot_iter_{iteration}.pdz" - # remove the earist - if self.full(): - eariest_record = self.records[0] - self.del_fn(eariest_record["path"]) - self.records.pop(0) - # add the new one self.save_fn(path) record = { @@ -75,10 +95,14 @@ class Snapshot(object): } self.records.append(record) - # update the record - with jsonlines.open(self.checkpoint_dir / "records.jsonl", 'w') as f: - for record in self.records: - f.write(record) + # remove the earist + if self.full(): + eariest_record = self.records[0] + self.del_fn(eariest_record["path"]) + self.records.pop(0) - def __call__(self, trainer): - self.save_checkpoint_and_update(trainer) + # update the record file + record_path = self.checkpoint_dir / "records.jsonl" + with jsonlines.open(record_path, 'w') as writer: + for record in self.records: + writer.write(record) diff --git a/parakeet/training/extensions/visualizer.py b/parakeet/training/extensions/visualizer.py index 9a1bcbb..2a42ae0 100644 --- a/parakeet/training/extensions/visualizer.py +++ b/parakeet/training/extensions/visualizer.py @@ -13,22 +13,31 @@ # limitations under the License. from visualdl import LogWriter + from parakeet.training.trainer import Trainer -from parakeet.utils.mp_tools import rank_zero_only +from parakeet.training import extension -class VisualDL(object): +class VisualDL(extension.Extension): """A wrapper of visualdl log writer. It assumes that the metrics to be visualized - are all scalars which are recorded into the `.observation` dictionary of the - trainer object. The dictionary is created for each step, thus the visualdl log + are all scalars which are recorded into the `.observation` dictionary of the + trainer object. The dictionary is created for each step, thus the visualdl log writer uses the iteration from the updater's `iteration` as the global step to add records. """ + trigger = (1, 'iteration') + default_name = 'visualdl' + priority = extension.PRIORITY_READER - def __init__(self, writer: LogWriter): - self.writer = writer + def __init__(self): + self.writer =... + + def initialize(self, trainer): + self.writer = LogWriter(logdir=str(trainer.out)) - @rank_zero_only def __call__(self, trainer: Trainer): for k, v in trainer.observation.items(): self.writer.add_scalar(k, v, step=trainer.updater.state.iteration) + + def finalize(self, trainer): + self.writer.close() diff --git a/parakeet/training/seeding.py b/parakeet/training/seeding.py index 1a6660f..1663d2d 100644 --- a/parakeet/training/seeding.py +++ b/parakeet/training/seeding.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random import logging import paddle -import random import numpy as np def seed_everything(seed: int): + """Seed paddle, random and np.random to help reproductivity.""" paddle.seed(seed) random.seed(seed) np.random.seed(seed) diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index 38ccefb..f4b6fbb 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -13,15 +13,18 @@ # limitations under the License. from pathlib import Path +from collections import OrderedDict +from typing import Callable, Union, List + import tqdm -from dataclasses import dataclass from parakeet.training.trigger import get_trigger, IntervalTrigger from parakeet.training.updater import UpdaterBase from parakeet.training.reporter import scope +from parakeet.training.extension import Extension, PRIORITY_READER -class ExtensionEntry(object): +class _ExtensionEntry(object): def __init__(self, extension, trigger, priority): self.extension = extension self.trigger = trigger @@ -31,29 +34,76 @@ class ExtensionEntry(object): class Trainer(object): def __init__(self, updater: UpdaterBase, - stop_trigger=None, - out='result', - extensions=None): + stop_trigger: Callable=None, + out: Union[str, Path]='result', + extensions: List[Extension]=None): self.updater = updater - self.extensions = {} + self.extensions = OrderedDict() self.stop_trigger = get_trigger(stop_trigger) self.out = Path(out) - self.observation = {} + self.observation =... + + self._done = False + if extensions: + for ext in extensions: + self.extend(ext) + + @property + def is_before_training(self): + return self.updater.state.iteration == 0 def extend(self, extension, name=None, trigger=None, priority=None): + # get name for the extension + # argument \ + # -> extention's name \ + # -> default_name (class name, when it is an object) \ + # -> function name when it is a function \ + # -> error + + if name is None: + name = getattr(extension, 'name', None) + if name is None: + name = getattr(extenion, 'default_name', None) + if name is None: + name = getattr(extension, '__name__', None) + if name is None: + raise ValueError( + "Name is not given for the extension.") + if name == 'training': + raise ValueError("training is a reserved name.") + + if trigger is None: + trigger = getattr(extension, 'trigger', (1, 'iteration')) trigger = get_trigger(trigger) + if priority is None: + priority = getattr(extension, 'priority', PRIORITY_READER) + + # add suffix to avoid nameing conflict ordinal = 0 modified_name = name while modified_name in self.extensions: - print(self.extensions.keys()) ordinal += 1 modified_name = f"{name}_{ordinal}" + extension.name = modified_name - self.extensions[modified_name] = ExtensionEntry(extension, trigger, - priority) + self.extensions[modified_name] = _ExtensionEntry(extension, trigger, + priority) + + def get_extension(self, name): + """get extension by name.""" + extensions = self.extensions + if name in extensions: + return extensions[name].extension + else: + raise ValueError(f'extension {name} not found') def run(self): + if self._done: + raise RuntimeError("Training is already done!.") + + self.out.mkdir(parents=True, exist_ok=True) + # sort extensions by priorities once extension_order = sorted( self.extensions.keys(), @@ -67,7 +117,7 @@ class Trainer(object): if hasattr(entry.extension, "initialize"): entry.extension.initialize(self) - update = self.updater.update + update = self.updater.update # training step stop_trigger = self.stop_trigger # TODO(chenfeiyu): display progress bar correctly diff --git a/parakeet/training/trigger.py b/parakeet/training/trigger.py index 5d165a6..f5834bc 100644 --- a/parakeet/training/trigger.py +++ b/parakeet/training/trigger.py @@ -12,21 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - -class IntervalTrigger(object): - def __init__(self, period: int, unit: str): - if unit not in ("iteration", "epoch"): - raise ValueError("unit should be 'iteration' or 'epoch'") - self.period = period - self.unit = unit - - def __call__(self, trainer): - state = trainer.updater.state - if self.unit == "epoch": - fire = state.epoch % self.period == 0 - else: - fire = state.iteration % self.period == 0 - return fire +from parakeet.training.triggers.interval_trigger import IntervalTrigger def never_file_trigger(trainer): diff --git a/parakeet/training/triggers/interval_trigger.py b/parakeet/training/triggers/interval_trigger.py new file mode 100644 index 0000000..82f441e --- /dev/null +++ b/parakeet/training/triggers/interval_trigger.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class IntervalTrigger(object): + """A Predicate to do something every N cycle.""" + + def __init__(self, period: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + self.period = period + self.unit = unit + + def __call__(self, trainer): + state = trainer.updater.state + # we use a special scheme so we can use iteration % period == 0 as + # the predicate + # increase the iteration then update parameters + # instead of updating then increase iteration + if self.unit == "epoch": + fire = state.epoch % self.period == 0 + else: + fire = state.iteration % self.period == 0 + return fire diff --git a/parakeet/training/triggers/time_trigger.py b/parakeet/training/triggers/time_trigger.py new file mode 100644 index 0000000..aff9382 --- /dev/null +++ b/parakeet/training/triggers/time_trigger.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class TimeTrigger(object): + """Trigger based on a fixed time interval. + + This trigger accepts iterations with a given interval time. + + Args: + period (float): Interval time. It is given in seconds. + + """ + + def __init__(self, period): + self._period = period + self._next_time = self._period + + def __call__(self, trainer): + if self._next_time < trainer.elapsed_time: + self._next_time += self._period + return True + else: + return False diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index fb3bb41..2d9ec3d 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -93,125 +93,3 @@ class UpdaterBase(object): def load(self, path): archive = paddle.load(path) self.set_state_dict(archive) - - -class StandardUpdater(UpdaterBase): - """An example of over-simplification. Things may not be that simple, but - you can subclass it to fit your need. - """ - - def __init__(self, - model: Layer, - optimizer: Optimizer, - dataloader: DataLoader, - init_state: Optional[UpdaterState]=None): - # it is designed to hold multiple models - models = {"main": model} - self.models: Dict[str, Layer] = models - self.model = model - - # it is designed to hold multiple optimizers - optimizers = {"main": optimizer} - self.optimizer = optimizer - self.optimizers: Dict[str, Optimizer] = optimizers - - # dataloaders - self.dataloader = dataloader - - # init state - if init_state is None: - self.state = UpdaterState() - else: - self.state = init_state - - self.train_iterator = iter(dataloader) - - def update(self): - self.state.iteration += 1 - - # switch to training mode - for layer in self.models.values(): - layer.train() - - # training for a step is implemented here - batch = self.read_batch() - self.update_core(batch) - - def update_core(self, batch): - """A simple case for a training step. Basic assumptions are: - Single model; - Single optimizer; - A batch from the dataloader is just the input of the model; - The model return a single loss, or a dict containing serval losses. - Parameters updates at every batch, no gradient accumulation. - """ - loss = self.model(*batch) - - if isinstance(loss, Tensor): - loss_dict = {"main": loss} - else: - # Dict[str, Tensor] - loss_dict = loss - if "main" not in loss_dict: - main_loss = 0 - for loss_item in loss.values(): - main_loss += loss_item - loss_dict["main"] = main_loss - - for name, loss_item in loss_dict.items(): - report(name, float(loss_item)) - - self.optimizer.clear_gradient() - loss_dict["main"].backward() - self.optimizer.update() - - def new_epoch(self): - """Start a new epoch.""" - self.state.epoch += 1 - - # NOTE: all batch sampler for distributed training should - # subclass DistributedBatchSampler and implement `set_epoch` method - batch_sampler = self.dataloader.batch_sampler - if isinstance(batch_sampler, DistributedBatchSampler): - batch_sampler.set_epoch(self.state.epoch) - self.train_iterator = iter(self.dataloader) - - def read_batch(self): - """Read a batch from the data loader, auto renew when data is exhausted.""" - with timer() as t: - try: - batch = next(self.train_iterator) - except StopIteration: - self.new_epoch() - batch = next(self.train_iterator) - logging.debug( - f"Read a batch takes {t.elapse}s.") # replace it with logging - return batch - - def state_dict(self): - """State dict of a Updater, model, optimizer and updater state are included.""" - state_dict = super().state_dict() - for name, layer in self.models.items(): - state_dict[f"{name}_params"] = layer.state_dict() - for name, optim in self.optimizers.items(): - state_dict[f"{name}_optimizer"] = optim.state_dict() - return state_dict - - def set_state_dict(self, state_dict): - """Set state dict for a Updater. Parameters of models, states for - optimizers and UpdaterState are restored.""" - for name, layer in self.models.items(): - layer.set_state_dict(state_dict[f"{name}_params"]) - for name, optim in self.optimizers.items(): - optim.set_state_dict(state_dict[f"{name}_optimizer"]) - super().set_state_dict(state_dict) - - def save(self, path): - """Save Updater state dict.""" - archive = self.state_dict() - paddle.save(archive, path) - - def load(self, path): - """Load Updater state dict.""" - archive = paddle.load(path) - self.set_state_dict(archive) diff --git a/parakeet/training/updaters/standard_updater.py b/parakeet/training/updaters/standard_updater.py new file mode 100644 index 0000000..5cc0252 --- /dev/null +++ b/parakeet/training/updaters/standard_updater.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from dataclasses import dataclass +from typing import Optional +from typing import Dict +from typing import Union + +from timer import timer +import paddle +from paddle import Tensor +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from parakeet.training.reporter import report +from parakeet.training.updater import UpdaterBase, UpdaterState + + +class StandardUpdater(UpdaterBase): + """An example of over-simplification. Things may not be that simple, but + you can subclass it to fit your need. + """ + + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state: Optional[UpdaterState]=None): + # it is designed to hold multiple models + models = {"main": model} + self.models: Dict[str, Layer] = models + self.model = model + + # it is designed to hold multiple optimizers + optimizers = {"main": optimizer} + self.optimizer = optimizer + self.optimizers: Dict[str, Optimizer] = optimizers + + # dataloaders + self.dataloader = dataloader + + # init state + if init_state is None: + self.state = UpdaterState() + else: + self.state = init_state + + self.train_iterator = iter(dataloader) + + def update(self): + self.state.iteration += 1 + + # switch to training mode + for layer in self.models.values(): + layer.train() + + # training for a step is implemented here + batch = self.read_batch() + self.update_core(batch) + + def update_core(self, batch): + """A simple case for a training step. Basic assumptions are: + Single model; + Single optimizer; + A batch from the dataloader is just the input of the model; + The model return a single loss, or a dict containing serval losses. + Parameters updates at every batch, no gradient accumulation. + """ + loss = self.model(*batch) + + if isinstance(loss, Tensor): + loss_dict = {"main": loss} + else: + # Dict[str, Tensor] + loss_dict = loss + if "main" not in loss_dict: + main_loss = 0 + for loss_item in loss.values(): + main_loss += loss_item + loss_dict["main"] = main_loss + + for name, loss_item in loss_dict.items(): + report(name, float(loss_item)) + + self.optimizer.clear_gradient() + loss_dict["main"].backward() + self.optimizer.update() + + def new_epoch(self): + """Start a new epoch.""" + self.state.epoch += 1 + + # NOTE: all batch sampler for distributed training should + # subclass DistributedBatchSampler and implement `set_epoch` method + batch_sampler = self.dataloader.batch_sampler + if isinstance(batch_sampler, DistributedBatchSampler): + batch_sampler.set_epoch(self.state.epoch) + self.train_iterator = iter(self.dataloader) + + def read_batch(self): + """Read a batch from the data loader, auto renew when data is exhausted.""" + with timer() as t: + try: + batch = next(self.train_iterator) + except StopIteration: + self.new_epoch() + batch = next(self.train_iterator) + logging.debug( + f"Read a batch takes {t.elapse}s.") # replace it with logging + return batch + + def state_dict(self): + """State dict of a Updater, model, optimizer and updater state are included.""" + state_dict = super().state_dict() + for name, layer in self.models.items(): + state_dict[f"{name}_params"] = layer.state_dict() + for name, optim in self.optimizers.items(): + state_dict[f"{name}_optimizer"] = optim.state_dict() + return state_dict + + def set_state_dict(self, state_dict): + """Set state dict for a Updater. Parameters of models, states for + optimizers and UpdaterState are restored.""" + for name, layer in self.models.items(): + layer.set_state_dict(state_dict[f"{name}_params"]) + for name, optim in self.optimizers.items(): + optim.set_state_dict(state_dict[f"{name}_optimizer"]) + super().set_state_dict(state_dict) + + def save(self, path): + """Save Updater state dict.""" + archive = self.state_dict() + paddle.save(archive, path) + + def load(self, path): + """Load Updater state dict.""" + archive = paddle.load(path) + self.set_state_dict(archive) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py deleted file mode 100644 index 120115f..0000000 --- a/tests/test_checkpoint.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -import shutil - -import numpy as np -from parakeet.training.checkpoint import KBest, KLatest - - -def test_kbest(): - def save_fn(path): - with open(path, 'wt') as f: - f.write(f"My path is {str(path)}\n") - - K = 1 - kbest_manager = KBest(max_size=K, save_fn=save_fn) - checkpoint_dir = Path("checkpoints") - if checkpoint_dir.exists(): - shutil.rmtree(checkpoint_dir) - checkpoint_dir.mkdir(parents=True) - a = np.random.rand(20) - for i, score in enumerate(a): - path = checkpoint_dir / f"step_{i}" - kbest_manager.add_checkpoint(score, path) - assert len(list(checkpoint_dir.glob("step_*"))) == K - shutil.rmtree(checkpoint_dir) - - -def test_klatest(): - def save_fn(path): - with open(path, 'wt') as f: - f.write(f"My path is {str(path)}\n") - - K = 5 - klatest_manager = KLatest(max_size=K, save_fn=save_fn) - checkpoint_dir = Path("checkpoints") - if checkpoint_dir.exists(): - shutil.rmtree(checkpoint_dir) - checkpoint_dir.mkdir(parents=True) - for i in range(20): - path = checkpoint_dir / f"step_{i}" - klatest_manager.add_checkpoint(path) - assert len(list(checkpoint_dir.glob("step_*"))) == K - shutil.rmtree(checkpoint_dir) From 29b8b8b0ea64f3c35408ad52277cfc9acba6dec6 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Sun, 27 Jun 2021 18:53:45 +0800 Subject: [PATCH 40/47] 1. better error handling; 2. use absolute path in snapshot records; 3. visualdl takes a logger as init argument rather than creating one. --- .../parallelwave_gan/baker/conf/default.yaml | 1 + .../parallelwave_gan/baker/pwg_updater.py | 3 +- examples/parallelwave_gan/baker/train.py | 29 ++++---- parakeet/training/extensions/snapshot.py | 15 ++-- parakeet/training/extensions/visualizer.py | 7 +- parakeet/training/trainer.py | 69 ++++++++++++++----- parakeet/training/updater.py | 6 +- 7 files changed, 80 insertions(+), 50 deletions(-) diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index 29d6c0f..ce5b064 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -125,3 +125,4 @@ log_interval_steps: 100 # Interval steps to record the training # OTHER SETTING # ########################################################### num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 \ No newline at end of file diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index bd7dbeb..f7ff916 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -24,11 +24,10 @@ from paddle.io import DistributedBatchSampler from timer import timer from parakeet.datasets.data_table import DataTable -from parakeet.training.updater import UpdaterBase, UpdaterState, StandardUpdater +from parakeet.training.updaters.standard_updater import StandardUpdater, UpdaterState from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.trainer import Trainer from parakeet.training.reporter import report -from parakeet.training.checkpoint import KBest, KLatest from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss from parakeet.utils.profile import synchronize diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index 7232f0b..bf8767a 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -36,11 +36,11 @@ from parakeet.datasets.data_table import DataTable from parakeet.training.updater import UpdaterBase from parakeet.training.trainer import Trainer from parakeet.training.reporter import report -from parakeet.training.checkpoint import KBest, KLatest +from parakeet.training import extension +from parakeet.training.extensions.snapshot import Snapshot +from parakeet.training.extensions.visualizer import VisualDL from parakeet.models.parallel_wavegan import PWGGenerator, PWGDiscriminator from parakeet.modules.stft_loss import MultiResolutionSTFTLoss -from parakeet.training.extensions.visualizer import VisualDL -from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.seeding import seed_everything from batch_fn import Clip @@ -66,6 +66,9 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + # construct dataset for training and validation with jsonlines.open(args.train_metadata, 'r') as reader: train_metadata = list(reader) @@ -106,12 +109,12 @@ def train_sp(args, config): train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=train_batch_fn, # TODO(defaine collate fn) + collate_fn=train_batch_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, batch_sampler=dev_sampler, - collate_fn=train_batch_fn, # TODO(defaine collate fn) + collate_fn=train_batch_fn, num_workers=config.num_workers) print("dataloaders done!") @@ -191,18 +194,14 @@ def train_sp(args, config): trigger=(config.eval_interval_steps, 'iteration'), priority=3) if dist.get_rank() == 0: - log_writer = LogWriter(str(output_dir)) + writer = LogWriter(str(trainer.out)) + trainer.extend(VisualDL(writer), trigger=(1, 'iteration')) trainer.extend( - VisualDL(log_writer), trigger=(1, 'iteration'), priority=1) - trainer.extend( - Snapshot(checkpoint_dir), - trigger=(config.save_interval_steps, 'iteration'), - priority=2) - print("Trainer Done!") + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) - # with paddle.fluid.profiler.profiler('All', 'total', - # str(output_dir / "profiler.log"), - # 'Default') as prof: + print(trainer.extensions.keys()) + print("Trainer Done!") trainer.run() diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py index 853d62e..a209524 100644 --- a/parakeet/training/extensions/snapshot.py +++ b/parakeet/training/extensions/snapshot.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import logging from pathlib import Path from datetime import datetime from typing import List, Dict, Any @@ -48,25 +49,25 @@ class Snapshot(extension.Extension): trigger = (1, 'epoch') priority = -100 + default_name = "snapshot" def __init__(self, max_size: int=5, snapshot_on_error: bool=False): self.records: List[Dict[str, Any]] = [] self.max_size = max_size self._snapshot_on_error = snapshot_on_error self._save_all = (max_size == -1) - self.save_fn =... - self.del_fn = os.remove self.checkpoint_dir =... def initialize(self, trainer: Trainer): """Setting up this extention.""" - self.save_fn = trainer.updater.save self.checkpoint_dir = trainer.out / "checkpoints" # load existing records - record_path: Path = self.checkpoint_dir / "records.yaml" + record_path: Path = self.checkpoint_dir / "records.jsonl" if record_path.exists(): + logging.debug("Loading from an existing checkpoint dir") self.records = load_records(record_path) + trainer.updater.load(self.records[-1]['path']) def on_error(self, trainer, exc, tb): if self._snapshot_on_error: @@ -87,10 +88,10 @@ class Snapshot(extension.Extension): path = self.checkpoint_dir / f"snapshot_iter_{iteration}.pdz" # add the new one - self.save_fn(path) + trainer.updater.save(path) record = { "time": str(datetime.now()), - 'path': str(path), + 'path': str(path.resolve()), # use absolute path 'iteration': iteration } self.records.append(record) @@ -98,7 +99,7 @@ class Snapshot(extension.Extension): # remove the earist if self.full(): eariest_record = self.records[0] - self.del_fn(eariest_record["path"]) + os.remove(eariest_record["path"]) self.records.pop(0) # update the record file diff --git a/parakeet/training/extensions/visualizer.py b/parakeet/training/extensions/visualizer.py index 2a42ae0..138bf1e 100644 --- a/parakeet/training/extensions/visualizer.py +++ b/parakeet/training/extensions/visualizer.py @@ -29,11 +29,8 @@ class VisualDL(extension.Extension): default_name = 'visualdl' priority = extension.PRIORITY_READER - def __init__(self): - self.writer =... - - def initialize(self, trainer): - self.writer = LogWriter(logdir=str(trainer.out)) + def __init__(self, writer): + self.writer = writer def __call__(self, trainer: Trainer): for k, v in trainer.observation.items(): diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index f4b6fbb..9e90d9a 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys +import six +import traceback from pathlib import Path from collections import OrderedDict from typing import Callable, Union, List @@ -63,7 +66,7 @@ class Trainer(object): if name is None: name = getattr(extension, 'name', None) if name is None: - name = getattr(extenion, 'default_name', None) + name = getattr(extension, 'default_name', None) if name is None: name = getattr(extension, '__name__', None) if name is None: @@ -112,7 +115,7 @@ class Trainer(object): extensions = [(name, self.extensions[name]) for name in extension_order] - print("initializing") + # initializing all extensions for name, entry in extensions: if hasattr(entry.extension, "initialize"): entry.extension.initialize(self) @@ -120,6 +123,8 @@ class Trainer(object): update = self.updater.update # training step stop_trigger = self.stop_trigger + print(self.updater.state) + # TODO(chenfeiyu): display progress bar correctly # if the trainer is controlled by epoch: use 2 progressbars # if the trainer is controlled by iteration: use 1 progressbar @@ -129,24 +134,50 @@ class Trainer(object): else: max_iteration = self.stop_trigger.period - p = tqdm.tqdm() + p = tqdm.tqdm(initial=self.updater.state.iteration) - while True: - self.observation = {} - # set observation as the report target - # you can use report freely in Updater.update() + try: + while not stop_trigger(self): + self.observation = {} + # set observation as the report target + # you can use report freely in Updater.update() - # updating parameters and state - with scope(self.observation): - update() - p.update() - print(self.observation) + # updating parameters and state + with scope(self.observation): + update() + p.update() - # execute extension when necessary - for name, entry in extensions: - if entry.trigger(self): - entry.extension(self) + # execute extension when necessary + for name, entry in extensions: + if entry.trigger(self): + entry.extension(self) - if stop_trigger(self): - print("Training Done!") - break + # print("###", self.observation) + except Exception as e: + f = sys.stderr + f.write(f"Exception in main training loop: {e}\n") + f.write("Traceback (most recent call last):\n") + traceback.print_tb(sys.exc_info()[2]) + f.write( + "Trainer extensions will try to handle the extension. Then all extensions will finalize." + ) + + # capture the exception in the mian training loop + exc_info = sys.exc_info() + + # try to handle it + for name, entry in extensions: + if hasattr(entry.extension, "on_error"): + try: + entry.extension.on_error(self, e, sys.exc_info()[2]) + except Exception as ee: + f.write(f"Exception in error handler: {ee}\n") + f.write('Traceback (most recent call last):\n') + traceback.print_tb(sys.exc_info()[2]) + + # raise exception in main training loop + six.reraise(*exc_info) + finally: + for name, entry in extensions: + if hasattr(entry.extension, "finalize"): + entry.extension.finalize(self) diff --git a/parakeet/training/updater.py b/parakeet/training/updater.py index 2d9ec3d..5ec5eec 100644 --- a/parakeet/training/updater.py +++ b/parakeet/training/updater.py @@ -87,9 +87,11 @@ class UpdaterBase(object): self.state.iteration = state_dict["iteration"] def save(self, path): + logging.debug(f"Saving to {path}.") archive = self.state_dict() - paddle.save(archive, path) + paddle.save(archive, str(path)) def load(self, path): - archive = paddle.load(path) + logging.debug(f"Loading from {path}.") + archive = paddle.load(str(path)) self.set_state_dict(archive) From ef51e1ab13e2bf0c146fab59200bd1a83070ed1b Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 30 Jun 2021 12:30:14 +0800 Subject: [PATCH 41/47] refined training module --- .../parallelwave_gan/baker/conf/default.yaml | 3 +- .../parallelwave_gan/baker/pwg_updater.py | 18 +----- examples/parallelwave_gan/baker/synthesize.py | 20 ++++-- examples/parallelwave_gan/baker/train.py | 4 +- parakeet/__init__.py | 4 +- parakeet/modules/stft_loss.py | 7 +- parakeet/training/extensions/snapshot.py | 3 +- parakeet/training/trainer.py | 21 +++--- parakeet/training/trigger.py | 2 + .../training/triggers/interval_trigger.py | 12 ++-- parakeet/training/triggers/limit_trigger.py | 31 +++++++++ .../training/updaters/standard_updater.py | 64 +++++++++++++++---- tests/test_optimizer.py | 39 +++++++++++ 13 files changed, 169 insertions(+), 59 deletions(-) create mode 100644 parakeet/training/triggers/limit_trigger.py create mode 100644 tests/test_optimizer.py diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index ce5b064..877be2c 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -125,4 +125,5 @@ log_interval_steps: 100 # Interval steps to record the training # OTHER SETTING # ########################################################### num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. -num_snapshots: 10 \ No newline at end of file +num_snapshots: 10 +seed: 42 \ No newline at end of file diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index f7ff916..838416a 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -74,18 +74,15 @@ class PWGUpdater(StandardUpdater): # Generator noise = paddle.randn(wav.shape) - synchronize() with timer() as t: wav_ = self.generator(noise, mel) - synchronize() logging.debug(f"Generator takes {t.elapse}s.") ## Multi-resolution stft loss - synchronize() + with timer() as t: sc_loss, mag_loss = self.criterion_stft( wav_.squeeze(1), wav.squeeze(1)) - synchronize() logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.") report("train/spectral_convergence_loss", float(sc_loss)) @@ -94,11 +91,9 @@ class PWGUpdater(StandardUpdater): ## Adversarial loss if self.state.iteration > self.discriminator_train_start_steps: - synchronize() with timer() as t: p_ = self.discriminator(wav_) adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) - synchronize() logging.debug( f"Discriminator and adversarial loss takes {t.elapse}s") report("train/adversarial_loss", float(adv_loss)) @@ -106,18 +101,14 @@ class PWGUpdater(StandardUpdater): report("train/generator_loss", float(gen_loss)) - synchronize() with timer() as t: self.optimizer_g.clear_grad() gen_loss.backward() - synchronize() logging.debug(f"Backward takes {t.elapse}s.") - synchronize() with timer() as t: self.optimizer_g.step() self.scheduler_g.step() - synchronize() logging.debug(f"Update takes {t.elapse}s.") # Disctiminator @@ -158,18 +149,15 @@ class PWGEvaluator(StandardEvaluator): wav, mel = batch noise = paddle.randn(wav.shape) - synchronize() with timer() as t: wav_ = self.generator(noise, mel) - synchronize() logging.debug(f"Generator takes {t.elapse}s") ## Multi-resolution stft loss - synchronize() + with timer() as t: sc_loss, mag_loss = self.criterion_stft( wav_.squeeze(1), wav.squeeze(1)) - synchronize() logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s") report("eval/spectral_convergence_loss", float(sc_loss)) @@ -177,11 +165,9 @@ class PWGEvaluator(StandardEvaluator): gen_loss = sc_loss + mag_loss ## Adversarial loss - synchronize() with timer() as t: p_ = self.discriminator(wav_) adv_loss = self.criterion_mse(p_, paddle.ones_like(p_)) - synchronize() logging.debug( f"Discriminator and adversarial loss takes {t.elapse}s") report("eval/adversarial_loss", float(adv_loss)) diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py index 4c4e754..fd84e85 100644 --- a/examples/parallelwave_gan/baker/synthesize.py +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -14,6 +14,7 @@ import os import sys +from timer import timer import logging import argparse from pathlib import Path @@ -25,6 +26,8 @@ import numpy as np import soundfile as sf from paddle import distributed as dist +paddle.set_device("cpu") + from parakeet.datasets.data_table import DataTable from parakeet.models.parallel_wavegan import PWGGenerator @@ -71,11 +74,20 @@ test_dataset = DataTable( output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) +N = 0 +T = 0 for example in test_dataset: utt_id = example['utt_id'] mel = example['feats'] mel = paddle.to_tensor(mel) # (T, C) - wav = generator.inference(c=mel) - wav = wav.numpy() - print(f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}") - sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + with timer() as t: + wav = generator.inference(c=mel) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}." + ) + sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=config.sr) +print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T) }") diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index bf8767a..ee012e2 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -60,7 +60,7 @@ def train_sp(args, config): paddle.distributed.init_parallel_env() # set the random seed, it is a must for multiprocess training - seed_everything(42) + seed_everything(config.seed) print( f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", @@ -149,6 +149,8 @@ def train_sp(args, config): output_dir = Path(args.output_dir) checkpoint_dir = output_dir / "checkpoints" if dist.get_rank() == 0: + with open(output_dir / "config.yaml", 'wt') as f: + f.write(config.dump(default_flow_style=None)) output_dir.mkdir(parents=True, exist_ok=True) checkpoint_dir.mkdir(parents=True, exist_ok=True) diff --git a/parakeet/__init__.py b/parakeet/__init__.py index cce4fff..f08f907 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.2.0-beta.0" +__version__ = "0.0.0" import logging from parakeet import audio, data, datasets, frontend, models, modules, training, utils - -logging.getLogger('parakeet').addHandler(logging.NullHandler()) diff --git a/parakeet/modules/stft_loss.py b/parakeet/modules/stft_loss.py index f98a3dd..cdc066f 100644 --- a/parakeet/modules/stft_loss.py +++ b/parakeet/modules/stft_loss.py @@ -43,9 +43,10 @@ class SpectralConvergenceLoss(nn.Layer): class LogSTFTMagnitudeLoss(nn.Layer): """Log STFT magnitude loss module.""" - def __init__(self): + def __init__(self, epsilon=1e-10): """Initilize los STFT magnitude loss module.""" super().__init__() + self.epsilon = epsilon def forward(self, x_mag, y_mag): """Calculate forward propagation. @@ -57,9 +58,9 @@ class LogSTFTMagnitudeLoss(nn.Layer): """ return F.l1_loss( paddle.log(paddle.clip( - y_mag, min=1e-10)), + y_mag, min=self.epsilon)), paddle.log(paddle.clip( - x_mag, min=1e-10))) + x_mag, min=self.epsilon))) class STFTLoss(nn.Layer): diff --git a/parakeet/training/extensions/snapshot.py b/parakeet/training/extensions/snapshot.py index a209524..92d74ef 100644 --- a/parakeet/training/extensions/snapshot.py +++ b/parakeet/training/extensions/snapshot.py @@ -106,4 +106,5 @@ class Snapshot(extension.Extension): record_path = self.checkpoint_dir / "records.jsonl" with jsonlines.open(record_path, 'w') as writer: for record in self.records: - writer.write(record) + # jsonlines.open may return a Writer or a Reader + writer.write(record) # pylint: disable=no-member diff --git a/parakeet/training/trainer.py b/parakeet/training/trainer.py index 9e90d9a..484845f 100644 --- a/parakeet/training/trainer.py +++ b/parakeet/training/trainer.py @@ -21,7 +21,7 @@ from typing import Callable, Union, List import tqdm -from parakeet.training.trigger import get_trigger, IntervalTrigger +from parakeet.training.trigger import get_trigger, IntervalTrigger, LimitTrigger from parakeet.training.updater import UpdaterBase from parakeet.training.reporter import scope from parakeet.training.extension import Extension, PRIORITY_READER @@ -42,7 +42,7 @@ class Trainer(object): extensions: List[Extension]=None): self.updater = updater self.extensions = OrderedDict() - self.stop_trigger = get_trigger(stop_trigger) + self.stop_trigger = LimitTrigger(*stop_trigger) self.out = Path(out) self.observation =... @@ -125,16 +125,19 @@ class Trainer(object): print(self.updater.state) - # TODO(chenfeiyu): display progress bar correctly - # if the trainer is controlled by epoch: use 2 progressbars - # if the trainer is controlled by iteration: use 1 progressbar - if isinstance(stop_trigger, IntervalTrigger): + # display only one progress bar + max_iteration = None + if isinstance(stop_trigger, LimitTrigger): if stop_trigger.unit is 'epoch': - max_epoch = self.stop_trigger.period + max_epoch = self.stop_trigger.limit + updates_per_epoch = getattr(self.updater, "updates_per_epoch", + None) + max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None else: - max_iteration = self.stop_trigger.period + max_iteration = self.stop_trigger.limit - p = tqdm.tqdm(initial=self.updater.state.iteration) + p = tqdm.tqdm( + initial=self.updater.state.iteration, total=max_iteration) try: while not stop_trigger(self): diff --git a/parakeet/training/trigger.py b/parakeet/training/trigger.py index f5834bc..b588512 100644 --- a/parakeet/training/trigger.py +++ b/parakeet/training/trigger.py @@ -13,6 +13,8 @@ # limitations under the License. from parakeet.training.triggers.interval_trigger import IntervalTrigger +from parakeet.training.triggers.limit_trigger import LimitTrigger +from parakeet.training.triggers.time_trigger import TimeTrigger def never_file_trigger(trainer): diff --git a/parakeet/training/triggers/interval_trigger.py b/parakeet/training/triggers/interval_trigger.py index 82f441e..b88816c 100644 --- a/parakeet/training/triggers/interval_trigger.py +++ b/parakeet/training/triggers/interval_trigger.py @@ -19,17 +19,13 @@ class IntervalTrigger(object): def __init__(self, period: int, unit: str): if unit not in ("iteration", "epoch"): raise ValueError("unit should be 'iteration' or 'epoch'") + if period <= 0: + raise ValueError("period should be a positive integer.") self.period = period self.unit = unit def __call__(self, trainer): state = trainer.updater.state - # we use a special scheme so we can use iteration % period == 0 as - # the predicate - # increase the iteration then update parameters - # instead of updating then increase iteration - if self.unit == "epoch": - fire = state.epoch % self.period == 0 - else: - fire = state.iteration % self.period == 0 + index = getattr(state, self.unit) + fire = index % self.period == 0 return fire diff --git a/parakeet/training/triggers/limit_trigger.py b/parakeet/training/triggers/limit_trigger.py new file mode 100644 index 0000000..dd7a135 --- /dev/null +++ b/parakeet/training/triggers/limit_trigger.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class LimitTrigger(object): + """A Predicate to decide whether to stop.""" + + def __init__(self, limit: int, unit: str): + if unit not in ("iteration", "epoch"): + raise ValueError("unit should be 'iteration' or 'epoch'") + if limit <= 0: + raise ValueError("limit should be a positive integer.") + self.limit = limit + self.unit = unit + + def __call__(self, trainer): + state = trainer.updater.state + index = getattr(state, self.unit) + fire = index >= self.limit + return fire diff --git a/parakeet/training/updaters/standard_updater.py b/parakeet/training/updaters/standard_updater.py index 5cc0252..e39b758 100644 --- a/parakeet/training/updaters/standard_updater.py +++ b/parakeet/training/updaters/standard_updater.py @@ -62,7 +62,40 @@ class StandardUpdater(UpdaterBase): self.train_iterator = iter(dataloader) def update(self): - self.state.iteration += 1 + # We increase the iteration index after updating and before extension. + # Here are the reasons. + + # 0. Snapshotting(as well as other extensions, like visualizer) is + # executed after a step of updating; + # 1. We decide to increase the iteration index after updating and + # before any all extension is executed. + # 3. We do not increase the iteration after extension because we + # prefer a consistent resume behavior, when load from a + # `snapshot_iter_100.pdz` then the next step to train is `101`, + # naturally. But if iteration is increased increased after + # extension(including snapshot), then, a `snapshot_iter_99` is + # loaded. You would need a extra increasing of the iteration idex + # before training to avoid another iteration `99`, which has been + # done before snapshotting. + # 4. Thus iteration index represrnts "currently how mant epochs has + # been done." + # NOTE: use report to capture the correctly value. If you want to + # report the learning rate used for a step, you must report it before + # the learning rate scheduler's step() has been called. In paddle's + # convention, we do not use an extension to change the learning rate. + # so if you want to report it, do it in the updater. + + # Then here comes the next question. When is the proper time to + # increase the epoch index? Since all extensions are executed after + # updating, it is the time that after updating is the proper time to + # increase epoch index. + # 1. If we increase the epoch index before updating, then an extension + # based ot epoch would miss the correct timing. It could only be + # triggerd after an extra updating. + # 2. Theoretically, when an epoch is done, the epoch index should be + # increased. So it would be increase after updating. + # 3. Thus, eppoch index represents "currently how many epochs has been + # done." So it starts from 0. # switch to training mode for layer in self.models.values(): @@ -72,6 +105,11 @@ class StandardUpdater(UpdaterBase): batch = self.read_batch() self.update_core(batch) + self.state.iteration += 1 + if self.updaters_per_epoch is not None: + if self.state.iteration % self.updaters_per_epoch == 0: + self.state.epoch += 1 + def update_core(self, batch): """A simple case for a training step. Basic assumptions are: Single model; @@ -100,10 +138,20 @@ class StandardUpdater(UpdaterBase): loss_dict["main"].backward() self.optimizer.update() + @property + def updaters_per_epoch(self): + """Number of updater per epoch, determined by the length of the + dataloader.""" + length_of_dataloader = None + try: + length_of_dataloader = len(self.dataloader) + except TypeError: + logging.debug("This dataloader has no __len__.") + finally: + return length_of_dataloader + def new_epoch(self): """Start a new epoch.""" - self.state.epoch += 1 - # NOTE: all batch sampler for distributed training should # subclass DistributedBatchSampler and implement `set_epoch` method batch_sampler = self.dataloader.batch_sampler @@ -140,13 +188,3 @@ class StandardUpdater(UpdaterBase): for name, optim in self.optimizers.items(): optim.set_state_dict(state_dict[f"{name}_optimizer"]) super().set_state_dict(state_dict) - - def save(self, path): - """Save Updater state dict.""" - archive = self.state_dict() - paddle.save(archive, path) - - def load(self, path): - """Load Updater state dict.""" - archive = paddle.load(path) - self.set_state_dict(archive) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py new file mode 100644 index 0000000..bdb3d96 --- /dev/null +++ b/tests/test_optimizer.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +from pathlib import Path + +import paddle +from paddle import nn +from paddle.optimizer import Adam +from paddle.optimizer.lr import StepDecay + + +def test_optimizer(): + model1 = nn.Linear(3, 4) + optim1 = Adam( + parameters=model1.parameters(), learning_rate=StepDecay(0.1, 100)) + + output_dir = Path("temp_test_optimizer") + shutil.rmtree(output_dir, ignore_errors=True) + output_dir.mkdir(exist_ok=True, parents=True) + + # model1.set_state_dict(model1.state_dict()) + optim1.set_state_dict(optim1.state_dict()) + + x = paddle.randn([6, 3]) + y = model1(x).sum() + y.backward() + optim1.step() From af26c1e3899e2d598854f5c9ac37a2cc6e17c915 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 30 Jun 2021 13:33:56 +0800 Subject: [PATCH 42/47] fix priority in example/parallelwavegan --- examples/parallelwave_gan/baker/train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index ee012e2..af787bb 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -149,10 +149,10 @@ def train_sp(args, config): output_dir = Path(args.output_dir) checkpoint_dir = output_dir / "checkpoints" if dist.get_rank() == 0: - with open(output_dir / "config.yaml", 'wt') as f: - f.write(config.dump(default_flow_style=None)) output_dir.mkdir(parents=True, exist_ok=True) checkpoint_dir.mkdir(parents=True, exist_ok=True) + with open(output_dir / "config.yaml", 'wt') as f: + f.write(config.dump(default_flow_style=None)) updater = PWGUpdater( models={ @@ -192,9 +192,7 @@ def train_sp(args, config): out=output_dir, ) trainer.extend( - evaluator, - trigger=(config.eval_interval_steps, 'iteration'), - priority=3) + evaluator, trigger=(config.eval_interval_steps, 'iteration')) if dist.get_rank() == 0: writer = LogWriter(str(trainer.out)) trainer.extend(VisualDL(writer), trigger=(1, 'iteration')) From a93fad051c37df8548011d5153ba9bbf854d4c2a Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 30 Jun 2021 13:50:41 +0800 Subject: [PATCH 43/47] auto choose device for inference --- examples/parallelwave_gan/baker/synthesize.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py index fd84e85..b805d60 100644 --- a/examples/parallelwave_gan/baker/synthesize.py +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -26,8 +26,6 @@ import numpy as np import soundfile as sf from paddle import distributed as dist -paddle.set_device("cpu") - from parakeet.datasets.data_table import DataTable from parakeet.models.parallel_wavegan import PWGGenerator @@ -37,7 +35,7 @@ parser = argparse.ArgumentParser( description="synthesize with parallel wavegan.") parser.add_argument( "--config", type=str, help="config file to overwrite default config") -parser.add_argument("--params", type=str, help="generator parameter file") +parser.add_argument("--checkpoint", type=str, help="snapshot to load") parser.add_argument("--test-metadata", type=str, help="dev data") parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument("--verbose", type=int, default=1, help="verbose") @@ -47,6 +45,11 @@ config = get_cfg_default() if args.config: config.merge_from_file(args.config) +if not paddle.is_compiled_with_cuda: + paddle.set_device("cpu") +else: + paddle.set_device("gpu:0") + print("========Args========") print(yaml.safe_dump(vars(args))) print("========Config========") @@ -56,8 +59,8 @@ print( ) generator = PWGGenerator(**config["generator_params"]) -state_dict = paddle.load(args.params) -generator.set_state_dict(state_dict) +state_dict = paddle.load(args.checkpoint) +generator.set_state_dict(state_dict["generator_params"]) generator.remove_weight_norm() generator.eval() From dd6772bc3e9e269367f0a6ac21ae2a162472c67e Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 30 Jun 2021 13:57:05 +0800 Subject: [PATCH 44/47] add --device to cli argument, use gpu by default --- examples/parallelwave_gan/baker/synthesize.py | 7 ++----- examples/parallelwave_gan/baker/train.py | 4 ++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/parallelwave_gan/baker/synthesize.py b/examples/parallelwave_gan/baker/synthesize.py index b805d60..01cfbbf 100644 --- a/examples/parallelwave_gan/baker/synthesize.py +++ b/examples/parallelwave_gan/baker/synthesize.py @@ -38,6 +38,7 @@ parser.add_argument( parser.add_argument("--checkpoint", type=str, help="snapshot to load") parser.add_argument("--test-metadata", type=str, help="dev data") parser.add_argument("--output-dir", type=str, help="output dir") +parser.add_argument("--device", type=str, default="gpu", help="device to run") parser.add_argument("--verbose", type=int, default=1, help="verbose") args = parser.parse_args() @@ -45,11 +46,6 @@ config = get_cfg_default() if args.config: config.merge_from_file(args.config) -if not paddle.is_compiled_with_cuda: - paddle.set_device("cpu") -else: - paddle.set_device("gpu:0") - print("========Args========") print(yaml.safe_dump(vars(args))) print("========Config========") @@ -58,6 +54,7 @@ print( f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) +paddle.set_device(args.device) generator = PWGGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/examples/parallelwave_gan/baker/train.py b/examples/parallelwave_gan/baker/train.py index af787bb..3699e6f 100644 --- a/examples/parallelwave_gan/baker/train.py +++ b/examples/parallelwave_gan/baker/train.py @@ -214,11 +214,15 @@ def main(): parser.add_argument("--train-metadata", type=str, help="training data") parser.add_argument("--dev-metadata", type=str, help="dev data") parser.add_argument("--output-dir", type=str, help="output dir") + parser.add_argument( + "--device", type=str, default="gpu", help="device type to use") parser.add_argument( "--nprocs", type=int, default=1, help="number of processes") parser.add_argument("--verbose", type=int, default=1, help="verbose") args = parser.parse_args() + if args.device == "cpu" and args.nprocs > 1: + raise RuntimeError("Multiprocess training on CPU is not supported.") config = get_cfg_default() if args.config: config.merge_from_file(args.config) From afe9d4a4f16f8cdd54c2d61afec4c6696931e6a4 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 30 Jun 2021 14:20:06 +0800 Subject: [PATCH 45/47] delete config for paralle training --- .../parallelwave_gan/baker/conf/parallel.yaml | 127 ------------------ 1 file changed, 127 deletions(-) delete mode 100644 examples/parallelwave_gan/baker/conf/parallel.yaml diff --git a/examples/parallelwave_gan/baker/conf/parallel.yaml b/examples/parallelwave_gan/baker/conf/parallel.yaml deleted file mode 100644 index f23b243..0000000 --- a/examples/parallelwave_gan/baker/conf/parallel.yaml +++ /dev/null @@ -1,127 +0,0 @@ -# This is the hyperparameter configuration file for Parallel WaveGAN. -# Please make sure this is adjusted for the CSMSC dataset. If you want to -# apply to the other dataset, you might need to carefully change some parameters. -# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN. - -########################################################### -# FEATURE EXTRACTION SETTING # -########################################################### -sr: 24000 # Sampling rate. -n_fft: 2048 # FFT size. -hop_length: 300 # Hop size. -win_length: 1200 # Window length. - # If set to null, it will be the same as fft_size. -window: "hann" # Window function. -n_mels: 80 # Number of mel basis. -fmin: 80 # Minimum freq in mel basis calculation. -fmax: 7600 # Maximum frequency in mel basis calculation. -# global_gain_scale: 1.0 # Will be multiplied to all of waveform. -trim_silence: false # Whether to trim the start and end of silence. -top_db: 60 # Need to tune carefully if the recording is not good. -trim_frame_length: 2048 # Frame size in trimming. -trim_hop_length: 512 # Hop size in trimming. -# format: "npy" # Feature file format. "npy" or "hdf5" is supported. - -########################################################### -# GENERATOR NETWORK ARCHITECTURE SETTING # -########################################################### -generator_params: - in_channels: 1 # Number of input channels. - out_channels: 1 # Number of output channels. - kernel_size: 3 # Kernel size of dilated convolution. - layers: 30 # Number of residual block layers. - stacks: 3 # Number of stacks i.e., dilation cycles. - residual_channels: 64 # Number of channels in residual conv. - gate_channels: 128 # Number of channels in gated conv. - skip_channels: 64 # Number of channels in skip conv. - aux_channels: 80 # Number of channels for auxiliary feature conv. - # Must be the same as num_mels. - aux_context_window: 2 # Context window size for auxiliary feature. - # If set to 2, previous 2 and future 2 frames will be considered. - dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - bias: true # use bias in residual blocks - use_weight_norm: true # Whether to use weight norm. - # If set to true, it will be applied to all of the conv layers. - use_causal_conv: false # use causal conv in residual blocks and upsample layers - # upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture. - upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. - interpolate_mode: "nearest" # upsample net interpolate mode - freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis - nonlinear_activation: null - nonlinear_activation_params: {} - -########################################################### -# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # -########################################################### -discriminator_params: - in_channels: 1 # Number of input channels. - out_channels: 1 # Number of output channels. - kernel_size: 3 # Number of output channels. - layers: 10 # Number of conv layers. - conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. - # If set to true, it will be applied to all of the conv layers. - nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. - nonlinear_activation_params: # Nonlinear function parameters - negative_slope: 0.2 # Alpha in LeakyReLU. - -########################################################### -# STFT LOSS SETTING # -########################################################### -stft_loss_params: - fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. - hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss - win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. - window: "hann" # Window function for STFT-based loss - -########################################################### -# ADVERSARIAL LOSS SETTING # -########################################################### -lambda_adv: 4.0 # Loss balancing coefficient. - -########################################################### -# DATA LOADER SETTING # -########################################################### -batch_size: 6 # Batch size. -batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. - -########################################################### -# OPTIMIZER & SCHEDULER SETTING # -########################################################### -generator_optimizer_params: - epsilon: 1.0e-6 # Generator's epsilon. - weight_decay: 0.0 # Generator's weight decay coefficient. -generator_scheduler_params: - learning_rate: 0.0001 # Generator's learning rate. - step_size: 100000 # Generator's scheduler step size. - gamma: 0.5 # Generator's scheduler gamma. - # At each step size, lr will be multiplied by this parameter. -generator_grad_norm: 10 # Generator's gradient norm. -discriminator_optimizer_params: - epsilon: 1.0e-6 # Discriminator's epsilon. - weight_decay: 0.0 # Discriminator's weight decay coefficient. -discriminator_scheduler_params: - learning_rate: 0.00005 # Discriminator's learning rate. - step_size: 100000 # Discriminator's scheduler step size. - gamma: 0.5 # Discriminator's scheduler gamma. - # At each step size, lr will be multiplied by this parameter. -discriminator_grad_norm: 1 # Discriminator's gradient norm. - -########################################################### -# INTERVAL SETTING # -########################################################### -discriminator_train_start_steps: 50000 # Number of steps to start to train discriminator. -train_max_steps: 400000 # Number of training steps. -save_interval_steps: 5000 # Interval steps to save checkpoint. -eval_interval_steps: 1000 # Interval steps to evaluate the network. -log_interval_steps: 100 # Interval steps to record the training log. - -########################################################### -# OTHER SETTING # -########################################################### -num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. From 3ebed00c964185de3dee32206071a56bdef0fb29 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 1 Jul 2021 16:14:55 +0800 Subject: [PATCH 46/47] minor fixes to refine code. --- examples/parallelwave_gan/baker/batch_fn.py | 2 +- .../baker/compute_statistics.py | 4 +++- .../parallelwave_gan/baker/conf/default.yaml | 11 +++++----- examples/parallelwave_gan/baker/preprocess.py | 22 ++++++++++++------- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/examples/parallelwave_gan/baker/batch_fn.py b/examples/parallelwave_gan/baker/batch_fn.py index aff647c..22af5af 100644 --- a/examples/parallelwave_gan/baker/batch_fn.py +++ b/examples/parallelwave_gan/baker/batch_fn.py @@ -25,7 +25,7 @@ class Clip(object): batch_max_steps=20480, hop_size=256, aux_context_window=0, ): - """Initialize customized collater for PyTorch DataLoader. + """Initialize customized collater for DataLoader. Args: batch_max_steps (int): The maximum length of input signal in batch. diff --git a/examples/parallelwave_gan/baker/compute_statistics.py b/examples/parallelwave_gan/baker/compute_statistics.py index 4db003f..06b9b65 100644 --- a/examples/parallelwave_gan/baker/compute_statistics.py +++ b/examples/parallelwave_gan/baker/compute_statistics.py @@ -39,7 +39,9 @@ def main(): parser.add_argument( "--metadata", type=str, help="json file with id and file paths ") parser.add_argument( - "--field-name", type=str, help="json file with id and file paths ") + "--field-name", + type=str, + help="name of the field to compute statistics for.") parser.add_argument( "--config", type=str, help="yaml format configuration file.") parser.add_argument( diff --git a/examples/parallelwave_gan/baker/conf/default.yaml b/examples/parallelwave_gan/baker/conf/default.yaml index 877be2c..777b2b0 100644 --- a/examples/parallelwave_gan/baker/conf/default.yaml +++ b/examples/parallelwave_gan/baker/conf/default.yaml @@ -18,9 +18,8 @@ fmax: 7600 # Maximum frequency in mel basis calculation. # global_gain_scale: 1.0 # Will be multiplied to all of waveform. trim_silence: false # Whether to trim the start and end of silence. top_db: 60 # Need to tune carefully if the recording is not good. -trim_frame_length: 2048 # Frame size in trimming. -trim_hop_length: 512 # Hop size in trimming. -# format: "npy" # Feature file format. "npy" or "hdf5" is supported. +trim_frame_length: 2048 # Frame size in trimming.(in samples) +trim_hop_length: 512 # Hop size in trimming.(in samples) ########################################################### # GENERATOR NETWORK ARCHITECTURE SETTING # @@ -119,11 +118,11 @@ discriminator_train_start_steps: 100000 # Number of steps to start to train disc train_max_steps: 400000 # Number of training steps. save_interval_steps: 5000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. -log_interval_steps: 100 # Interval steps to record the training log. + ########################################################### # OTHER SETTING # ########################################################### num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. -num_snapshots: 10 -seed: 42 \ No newline at end of file +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file diff --git a/examples/parallelwave_gan/baker/preprocess.py b/examples/parallelwave_gan/baker/preprocess.py index 3ae461b..6144c34 100644 --- a/examples/parallelwave_gan/baker/preprocess.py +++ b/examples/parallelwave_gan/baker/preprocess.py @@ -147,8 +147,11 @@ def process_sentence(config: Dict[str, Any], # adjust time to make num_samples == num_frames * hop_length num_frames = logmel.shape[1] - y = np.pad(y, (0, config.n_fft), mode="reflect") - y = y[:num_frames * config.hop_length] + if y.size < num_frames * config.hop_length: + y = np.pad(y, (0, num_frames * config.hop_length - y.size), + mode="reflect") + else: + y = y[:num_frames * config.hop_length] num_sample = y.shape[0] mel_path = output_dir / (utt_id + "_feats.npy") @@ -241,13 +244,16 @@ def main(): list((root_dir / "PhoneLabeling").rglob("*.interval"))) # split data into 3 sections - train_wav_files = wav_files[:9800] - dev_wav_files = wav_files[9800:9900] - test_wav_files = wav_files[9900:] + num_train = 9800 + num_dev = 100 - train_alignment_files = alignment_files[:9800] - dev_alignment_files = alignment_files[9800:9900] - test_alignment_files = alignment_files[9900:] + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + + train_alignment_files = alignment_files[:num_train] + dev_alignment_files = alignment_files[num_train:num_train + num_dev] + test_alignment_files = alignment_files[num_train + num_dev:] train_dump_dir = dumpdir / "train" / "raw" train_dump_dir.mkdir(parents=True, exist_ok=True) From e41423caf08b06b77d5ec6a54af5811bbf6d79c9 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 1 Jul 2021 16:48:34 +0800 Subject: [PATCH 47/47] avoid duplicated computation in validation, compute adversarial before stft loss. --- .../parallelwave_gan/baker/pwg_updater.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/examples/parallelwave_gan/baker/pwg_updater.py b/examples/parallelwave_gan/baker/pwg_updater.py index 838416a..dde7773 100644 --- a/examples/parallelwave_gan/baker/pwg_updater.py +++ b/examples/parallelwave_gan/baker/pwg_updater.py @@ -153,17 +153,6 @@ class PWGEvaluator(StandardEvaluator): wav_ = self.generator(noise, mel) logging.debug(f"Generator takes {t.elapse}s") - ## Multi-resolution stft loss - - with timer() as t: - sc_loss, mag_loss = self.criterion_stft( - wav_.squeeze(1), wav.squeeze(1)) - logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s") - - report("eval/spectral_convergence_loss", float(sc_loss)) - report("eval/log_stft_magnitude_loss", float(mag_loss)) - gen_loss = sc_loss + mag_loss - ## Adversarial loss with timer() as t: p_ = self.discriminator(wav_) @@ -171,15 +160,22 @@ class PWGEvaluator(StandardEvaluator): logging.debug( f"Discriminator and adversarial loss takes {t.elapse}s") report("eval/adversarial_loss", float(adv_loss)) - gen_loss += self.lambda_adv * adv_loss + gen_loss = self.lambda_adv * adv_loss + + # stft loss + with timer() as t: + sc_loss, mag_loss = self.criterion_stft( + wav_.squeeze(1), wav.squeeze(1)) + logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s") + + report("eval/spectral_convergence_loss", float(sc_loss)) + report("eval/log_stft_magnitude_loss", float(mag_loss)) + gen_loss += sc_loss + mag_loss report("eval/generator_loss", float(gen_loss)) # Disctiminator - with paddle.no_grad(): - wav_ = self.generator(noise, mel) p = self.discriminator(wav) - p_ = self.discriminator(wav_.detach()) real_loss = self.criterion_mse(p, paddle.ones_like(p)) fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_)) report("eval/real_loss", float(real_loss))