From c866bb0b57860b8c15fff2466a067faeddfa9c32 Mon Sep 17 00:00:00 2001 From: iclementine Date: Sun, 20 Dec 2020 13:11:54 +0800 Subject: [PATCH 1/3] discard tests/ temporarily for outdated code --- tests/test_attention.py | 101 ------------------------- tests/test_cbhg.py | 34 --------- tests/test_clarinet.py | 43 ----------- tests/test_connections.py | 33 -------- tests/test_conv.py | 67 ---------------- tests/test_dataset.py | 122 ------------------------------ tests/test_deepvoice3.py | 107 -------------------------- tests/test_geometry.py | 19 ----- tests/test_losses.py | 33 -------- tests/test_masking.py | 54 ------------- tests/test_position_encoding.py | 64 ---------------- tests/test_stft.py | 27 ------- tests/test_transformer.py | 43 ----------- tests/test_transformer_tts.py | 121 ----------------------------- tests/test_waveflow.py | 130 -------------------------------- 15 files changed, 998 deletions(-) delete mode 100644 tests/test_attention.py delete mode 100644 tests/test_cbhg.py delete mode 100644 tests/test_clarinet.py delete mode 100644 tests/test_connections.py delete mode 100644 tests/test_conv.py delete mode 100644 tests/test_dataset.py delete mode 100644 tests/test_deepvoice3.py delete mode 100644 tests/test_geometry.py delete mode 100644 tests/test_losses.py delete mode 100644 tests/test_masking.py delete mode 100644 tests/test_position_encoding.py delete mode 100644 tests/test_stft.py delete mode 100644 tests/test_transformer.py delete mode 100644 tests/test_transformer_tts.py delete mode 100644 tests/test_waveflow.py diff --git a/tests/test_attention.py b/tests/test_attention.py deleted file mode 100644 index 7865b68..0000000 --- a/tests/test_attention.py +++ /dev/null @@ -1,101 +0,0 @@ -import unittest -import numpy as np -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.modules import attention as attn - -class TestScaledDotProductAttention(unittest.TestCase): - def test_without_mask(self): - x = paddle.randn([4, 16, 8]) - context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x) - assert(list(context_vector.shape) == [4, 16, 8]) - assert(list(attention_weights.shape) == [4, 16, 16]) - - def test_with_mask(self): - x = paddle.randn([4, 16, 8]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype) - mask = mask.unsqueeze(1) # unsqueeze for the decoder time steps - context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x, mask) - assert(list(context_vector.shape) == [4, 16, 8]) - assert(list(attention_weights.shape) == [4, 16, 16]) - - def test_4d(self): - x = paddle.randn([4, 6, 16, 8]) - context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x) - assert(list(context_vector.shape) == [4, 6, 16, 8]) - assert(list(attention_weights.shape) == [4, 6, 16, 16]) - - -class TestMonoheadAttention(unittest.TestCase): - def test_io(self): - net = attn.MonoheadAttention(6, 0.1) - q = paddle.randn([4, 18, 6]) - k = paddle.randn([4, 12, 6]) - v = paddle.randn([4, 12, 6]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q - context_vector, attn_weights = net(q, k, v, mask) - self.assertTupleEqual(context_vector.numpy().shape, (4, 18, 6)) - self.assertTupleEqual(attn_weights.numpy().shape, (4, 18, 12)) - - -class TestDropHead(unittest.TestCase): - def test_drop(self): - x = paddle.randn([4, 6, 16, 8]) - out = attn.drop_head(x, 2, training=True) - # drop 2 head from 6 at all positions - np.testing.assert_allclose(np.sum(out.numpy() == 0., axis=1), 2) - - def test_drop_all(self): - x = paddle.randn([4, 6, 16, 8]) - out = attn.drop_head(x, 6, training=True) - np.testing.assert_allclose(np.sum(out.numpy()), 0) - - def test_eval(self): - x = paddle.randn([4, 6, 16, 8]) - out = attn.drop_head(x, 6, training=False) - self.assertIs(x, out) - - -class TestMultiheadAttention(unittest.TestCase): - def __init__(self, methodName="test_io", same_qk=True): - super(TestMultiheadAttention, self).__init__(methodName) - self.same_qk = same_qk - - def setUp(self): - if self.same_qk: - net = attn.MultiheadAttention(64, 8, dropout=0.3) - else: - net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6) - self.net =net - - def test_io(self): - q = paddle.randn([4, 12, 64]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q - context_vector, attention_weights = self.net(q, q, q, mask) - self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64)) - self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12)) - - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - suite.addTest(TestScaledDotProductAttention("test_without_mask")) - suite.addTest(TestScaledDotProductAttention("test_with_mask")) - suite.addTest(TestScaledDotProductAttention("test_4d")) - - suite.addTest(TestDropHead("test_drop")) - suite.addTest(TestDropHead("test_drop_all")) - suite.addTest(TestDropHead("test_eval")) - - suite.addTest(TestMonoheadAttention("test_io")) - - suite.addTest(TestMultiheadAttention("test_io", same_qk=True)) - suite.addTest(TestMultiheadAttention("test_io", same_qk=False)) - - return suite \ No newline at end of file diff --git a/tests/test_cbhg.py b/tests/test_cbhg.py deleted file mode 100644 index 08ccbcc..0000000 --- a/tests/test_cbhg.py +++ /dev/null @@ -1,34 +0,0 @@ -import unittest -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) -from parakeet.modules import cbhg - - -class TestHighway(unittest.TestCase): - def test_io(self): - net = cbhg.Highway(4) - x = paddle.randn([2, 12, 4]) - y = net(x) - self.assertTupleEqual(y.numpy().shape, (2, 12, 4)) - - -class TestCBHG(unittest.TestCase): - def __init__(self, methodName="runTest", ): - super(TestCBHG, self).__init__(methodName) - - def test_io(self): - self.net = cbhg.CBHG(64, 32, 16, - projection_channels=[64, 128], - num_highways=4, highway_features=128, - gru_features=64) - x = paddle.randn([4, 64, 32]) - y = self.net(x) - self.assertTupleEqual(y.numpy().shape, (4, 32, 128)) - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - - suite.addTest(TestHighway("test_io")) - suite.addTest(TestCBHG("test_io")) - return suite diff --git a/tests/test_clarinet.py b/tests/test_clarinet.py deleted file mode 100644 index 32e8bff..0000000 --- a/tests/test_clarinet.py +++ /dev/null @@ -1,43 +0,0 @@ -import unittest -import numpy as np - -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.models import clarinet -from parakeet.modules import stft - -class TestParallelWaveNet(unittest.TestCase): - def test_io(self): - net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2) - x = paddle.randn([4, 6073]) - condition = paddle.randn([4, 12, 6073]) - z, out_mu, out_log_std = net(x, condition) - self.assertTupleEqual(z.numpy().shape, (4, 6073)) - self.assertTupleEqual(out_mu.numpy().shape, (4, 6073)) - self.assertTupleEqual(out_log_std.numpy().shape, (4, 6073)) - - -class TestClariNet(unittest.TestCase): - def setUp(self): - encoder = clarinet.UpsampleNet([2, 2]) - teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0) - student = clarinet.ParallelWaveNet([8, 8, 8, 8, 8, 8], [1, 1, 1, 1, 1, 1], 16, 12, 2) - stft_module = stft.STFT(16, 4, 8) - net = clarinet.Clarinet(encoder, teacher, student, stft_module, -6.0, lmd=4) - print("context size is: ", teacher.context_size) - self.net = net - - def test_io(self): - audio = paddle.randn([4, 1366]) - mel = paddle.randn([4, 12, 512]) # 512 * 4 =2048 - audio_start = paddle.zeros([4], dtype="int64") - loss = self.net(audio, mel, audio_start, clip_kl=True) - loss["loss"].numpy() - - def test_synthesis(self): - mel = paddle.randn([4, 12, 512]) # 64 = 246 / 4 - out = self.net.synthesis(mel) - self.assertTupleEqual(out.numpy().shape, (4, 2048)) - \ No newline at end of file diff --git a/tests/test_connections.py b/tests/test_connections.py deleted file mode 100644 index be0401a..0000000 --- a/tests/test_connections.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -import paddle -from paddle import nn -paddle.disable_static(paddle.CPUPlace()) -paddle.set_default_dtype("float64") - -from parakeet.modules import connections as conn - -class TestPreLayerNormWrapper(unittest.TestCase): - def test_io(self): - net = nn.Linear(8, 8) - net = conn.PreLayerNormWrapper(net, 8) - x = paddle.randn([4, 8]) - y = net(x) - self.assertTupleEqual(x.numpy().shape, y.numpy().shape) - - -class TestPostLayerNormWrapper(unittest.TestCase): - def test_io(self): - net = nn.Linear(8, 8) - net = conn.PostLayerNormWrapper(net, 8) - x = paddle.randn([4, 8]) - y = net(x) - self.assertTupleEqual(x.numpy().shape, y.numpy().shape) - - -class TestResidualWrapper(unittest.TestCase): - def test_io(self): - net = nn.Linear(8, 8) - net = conn.ResidualWrapper(net) - x = paddle.randn([4, 8]) - y = net(x) - self.assertTupleEqual(x.numpy().shape, y.numpy().shape) \ No newline at end of file diff --git a/tests/test_conv.py b/tests/test_conv.py deleted file mode 100644 index b76e719..0000000 --- a/tests/test_conv.py +++ /dev/null @@ -1,67 +0,0 @@ -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) -import unittest -import numpy as np - -from parakeet.modules import conv - -class TestConv1dCell(unittest.TestCase): - def setUp(self): - self.net = conv.Conv1dCell(4, 6, 5, dilation=2) - - def forward_incremental(self, x): - outs = [] - self.net.start_sequence() - with paddle.no_grad(): - for i in range(x.shape[-1]): - xt = x[:, :, i] - yt = self.net.add_input(xt) - outs.append(yt) - y2 = paddle.stack(outs, axis=-1) - return y2 - - def test_equality(self): - x = paddle.randn([2, 4, 16]) - y1 = self.net(x) - - self.net.eval() - y2 = self.forward_incremental(x) - - np.testing.assert_allclose(y2.numpy(), y1.numpy()) - - -class TestConv1dBatchNorm(unittest.TestCase): - def __init__(self, methodName="runTest", causal=False, channel_last=False): - super(TestConv1dBatchNorm, self).__init__(methodName) - self.causal = causal - self.channel_last = channel_last - - def setUp(self): - k = 5 - paddding = (k - 1, 0) if self.causal else ((k-1) // 2, k //2) - self.net = conv.Conv1dBatchNorm(4, 6, (k,), 1, padding=paddding, - data_format="NLC" if self.channel_last else "NCL") - - def test_input_output(self): - x = paddle.randn([4, 16, 4]) if self.channel_last else paddle.randn([4, 4, 16]) - out = self.net(x) - out_np = out.numpy() - if self.channel_last: - self.assertTupleEqual(out_np.shape, (4, 16, 6)) - else: - self.assertTupleEqual(out_np.shape, (4, 6, 16)) - - def runTest(self): - self.test_input_output() - - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - suite.addTest(TestConv1dBatchNorm("runTest", True, True)) - suite.addTest(TestConv1dBatchNorm("runTest", False, False)) - suite.addTest(TestConv1dBatchNorm("runTest", True, False)) - suite.addTest(TestConv1dBatchNorm("runTest", False, True)) - suite.addTest(TestConv1dCell("test_equality")) - - return suite \ No newline at end of file diff --git a/tests/test_dataset.py b/tests/test_dataset.py deleted file mode 100644 index eafd74a..0000000 --- a/tests/test_dataset.py +++ /dev/null @@ -1,122 +0,0 @@ -import unittest -import numpy as np -import paddle -from paddle import io -from parakeet import data - -class MyDataset(io.Dataset): - def __init__(self, size): - self._data = np.random.randn(size, 6) - - def __getitem__(self, i): - return self._data[i] - - def __len__(self): - return self._data.shape[0] - - -class TestTransformDataset(unittest.TestCase): - def test(self): - dataset = MyDataset(20) - dataset = data.TransformDataset(dataset, lambda x: np.abs(x)) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("TransformDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) - - -class TestChainDataset(unittest.TestCase): - def test(self): - dataset1 = MyDataset(20) - dataset2 = MyDataset(40) - dataset = data.ChainDataset(dataset1, dataset2) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("ChainDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) - - -class TestTupleDataset(unittest.TestCase): - def test(self): - dataset1 = MyDataset(20) - dataset2 = MyDataset(20) - dataset = data.TupleDataset(dataset1, dataset2) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("TupleDataset") - for field1, field2 in dataloader: - print(type(field1), field1.dtype, field1.shape) - print(type(field2), field2.dtype, field2.shape) - - -class TestDictDataset(unittest.TestCase): - def test(self): - dataset1 = MyDataset(20) - dataset2 = MyDataset(20) - dataset = data.DictDataset(field1=dataset1, field2=dataset2) - def collate_fn(examples): - examples_tuples = [] - for example in examples: - examples_tuples.append(example.values()) - return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(examples_tuples) - - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1, collate_fn=collate_fn) - print("DictDataset") - for field1, field2 in dataloader: - print(type(field1), field1.dtype, field1.shape) - print(type(field2), field2.dtype, field2.shape) - - -class TestSliceDataset(unittest.TestCase): - def test(self): - dataset = MyDataset(40) - dataset = data.SliceDataset(dataset, 0, 20) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("SliceDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) - - -class TestSplit(unittest.TestCase): - def test(self): - dataset = MyDataset(40) - train, valid = data.split(dataset, 10) - dataloader1 = io.DataLoader(train, batch_size=4, shuffle=True, num_workers=1) - dataloader2 = io.DataLoader(valid, batch_size=4, shuffle=True, num_workers=1) - print("First Dataset") - for batch, in dataloader1: - print(type(batch), batch.dtype, batch.shape) - - print("Second Dataset") - for batch, in dataloader2: - print(type(batch), batch.dtype, batch.shape) - - -class TestSubsetDataset(unittest.TestCase): - def test(self): - dataset = MyDataset(40) - indices = np.random.choice(np.arange(40), [20], replace=False).tolist() - dataset = data.SubsetDataset(dataset, indices) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("SubsetDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) - - -class TestFilterDataset(unittest.TestCase): - def test(self): - dataset = MyDataset(40) - dataset = data.FilterDataset(dataset, lambda x: np.mean(x)> 0.3) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("FilterDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) - - -class TestCacheDataset(unittest.TestCase): - def test(self): - dataset = MyDataset(40) - dataset = data.CacheDataset(dataset) - dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) - print("CacheDataset") - for batch, in dataloader: - print(type(batch), batch.dtype, batch.shape) diff --git a/tests/test_deepvoice3.py b/tests/test_deepvoice3.py deleted file mode 100644 index 5abe272..0000000 --- a/tests/test_deepvoice3.py +++ /dev/null @@ -1,107 +0,0 @@ -import numpy as np -import unittest -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.models import deepvoice3 as dv3 - -class TestConvBlock(unittest.TestCase): - def test_io_causal(self): - net = dv3.ConvBlock(6, 5, True, True, 8, 0.9) - x = paddle.randn([4, 32, 6]) - condition = paddle.randn([4, 8]) - # TODO(chenfeiyu): to report an issue on default data type - padding = paddle.zeros([4, 4, 6], dtype=x.dtype) - y = net.forward(x, condition, padding) - self.assertTupleEqual(y.numpy().shape, (4, 32, 6)) - - def test_io_non_causal(self): - net = dv3.ConvBlock(6, 5, False, True, 8, 0.9) - x = paddle.randn([4, 32, 6]) - condition = paddle.randn([4, 8]) - y = net.forward(x, condition) - self.assertTupleEqual(y.numpy().shape, (4, 32, 6)) - - -class TestAffineBlock1(unittest.TestCase): - def test_io(self): - net = dv3.AffineBlock1(6, 16, True, 8) - x = paddle.randn([4, 32, 6]) - condition = paddle.randn([4, 8]) - y = net(x, condition) - self.assertTupleEqual(y.numpy().shape, (4, 32, 16)) - - -class TestAffineBlock2(unittest.TestCase): - def test_io(self): - net = dv3.AffineBlock2(6, 16, True, 8) - x = paddle.randn([4, 32, 6]) - condition = paddle.randn([4, 8]) - y = net(x, condition) - self.assertTupleEqual(y.numpy().shape, (4, 32, 16)) - - -class TestEncoder(unittest.TestCase): - def test_io(self): - net = dv3.Encoder(5, 8, 16, 5, True, 6) - x = paddle.randn([4, 32, 8]) - condition = paddle.randn([4, 6]) - keys, values = net(x, condition) - self.assertTupleEqual(keys.numpy().shape, (4, 32, 8)) - self.assertTupleEqual(values.numpy().shape, (4, 32, 8)) - - -class TestAttentionBlock(unittest.TestCase): - def test_io(self): - net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8) - q = paddle.randn([4, 32, 6]) - k = paddle.randn([4, 24, 6]) - v = paddle.randn([4, 24, 6]) - lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64") - condition = paddle.randn([4, 8]) - context_vector, attention_weight = net(q, k, v, lengths, condition, 0) - self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6)) - self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24)) - - def test_io_with_previous_attn(self): - net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8) - q = paddle.randn([4, 32, 6]) - k = paddle.randn([4, 24, 6]) - v = paddle.randn([4, 24, 6]) - lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64") - condition = paddle.randn([4, 8]) - prev_attn_weight = paddle.randn([4, 32, 16]) - - context_vector, attention_weight = net( - q, k, v, lengths, condition, 0, - force_monotonic=True, prev_coeffs=prev_attn_weight, window=(0, 4)) - self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6)) - self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24)) - - -class TestDecoder(unittest.TestCase): - def test_io(self): - net = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6) - x = paddle.randn([4, 32, 8]) - k = paddle.randn([4, 24, 12]) # prenet's last size should equals k's feature size - v = paddle.randn([4, 24, 12]) - lengths = paddle.to_tensor([24, 18, 19, 22]) - condition = paddle.randn([4, 6]) - decoded, hidden, attentions, final_state = net(x, k, v, lengths, 0, condition) - self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8)) - self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12)) - self.assertEqual(len(attentions), 5) - self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24)) - self.assertEqual(len(final_state), 5) - self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12)) - - -class TestPostNet(unittest.TestCase): - def test_io(self): - net = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6) - x = paddle.randn([4, 32, 8]) - condition = paddle.randn([4, 6]) - y = net(x, condition) - self.assertTupleEqual(y.numpy().shape, (4, 32 * 4, 12)) - diff --git a/tests/test_geometry.py b/tests/test_geometry.py deleted file mode 100644 index 1c0efeb..0000000 --- a/tests/test_geometry.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest -import numpy as np - -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.modules import geometry as geo - -class TestShuffleDim(unittest.TestCase): - def test_perm(self): - x = paddle.randn([2, 3, 4, 6]) - y = geo.shuffle_dim(x, 2, [3, 2, 1, 0]) - np.testing.assert_allclose(x.numpy()[0, 0, :, 0], y.numpy()[0, 0, ::-1, 0]) - - def test_random_perm(self): - x = paddle.randn([2, 3, 4, 6]) - y = geo.shuffle_dim(x, 2) - np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2)) \ No newline at end of file diff --git a/tests/test_losses.py b/tests/test_losses.py deleted file mode 100644 index fa38eee..0000000 --- a/tests/test_losses.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -import paddle -paddle.set_device("cpu") -import numpy as np - -from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy - -class TestWeightedMean(unittest.TestCase): - def test(self): - x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3]) - mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1) - loss = weighted_mean(x, mask) - self.assertAlmostEqual(loss.numpy()[0], 7) - - -class TestMaskedL1Loss(unittest.TestCase): - def test(self): - x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3]) - y = paddle.zeros_like(x) - mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1) - loss = masked_l1_loss(x, y, mask) - print(loss) - self.assertAlmostEqual(loss.numpy()[0], 7) - - -class TestMaskedCrossEntropy(unittest.TestCase): - def test(self): - x = paddle.randn([3, 30, 8], dtype="float64") - y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1) # mind this - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1) - loss = masked_softmax_with_cross_entropy(x, y, mask) - print(loss) diff --git a/tests/test_masking.py b/tests/test_masking.py deleted file mode 100644 index c1a388b..0000000 --- a/tests/test_masking.py +++ /dev/null @@ -1,54 +0,0 @@ -import unittest -import numpy as np -import paddle -paddle.set_default_dtype("float64") - -from parakeet.modules import masking - - -def sequence_mask(lengths, max_length=None, dtype="bool"): - max_length = max_length or np.max(lengths) - ids = np.arange(max_length) - return (ids < np.expand_dims(lengths, -1)).astype(dtype) - -def future_mask(lengths, max_length=None, dtype="bool"): - max_length = max_length or np.max(lengths) - return np.tril(np.tril(np.ones(max_length))).astype(dtype) - -class TestIDMask(unittest.TestCase): - def test(self): - ids = paddle.to_tensor( - [[1, 2, 3, 0, 0, 0], - [2, 4, 5, 6, 0, 0], - [7, 8, 9, 0, 0, 0]] - ) - mask = masking.id_mask(ids) - self.assertTupleEqual(mask.numpy().shape, ids.numpy().shape) - print(mask.numpy()) - -class TestFeatureMask(unittest.TestCase): - def test(self): - features = np.random.randn(3, 16, 8) - lengths = [16, 14, 12] - for i, length in enumerate(lengths): - features[i, length:, :] = 0 - - feature_tensor = paddle.to_tensor(features) - mask = masking.feature_mask(feature_tensor, -1) - self.assertTupleEqual(mask.numpy().shape, (3, 16, 1)) - print(mask.numpy().squeeze()) - - -class TestCombineMask(unittest.TestCase): - def test_bool_mask(self): - lengths = np.array([12, 8, 9, 10]) - padding_mask = sequence_mask(lengths, dtype="bool") - no_future_mask = future_mask(lengths, dtype="bool") - combined_mask1 = np.expand_dims(padding_mask, 1) * no_future_mask - - print(paddle.to_tensor(padding_mask).dtype) - print(paddle.to_tensor(no_future_mask).dtype) - combined_mask2 = masking.combine_mask( - paddle.to_tensor(padding_mask).unsqueeze(1), paddle.to_tensor(no_future_mask) - ) - np.testing.assert_allclose(combined_mask2.numpy(), combined_mask1) diff --git a/tests/test_position_encoding.py b/tests/test_position_encoding.py deleted file mode 100644 index 408c0d2..0000000 --- a/tests/test_position_encoding.py +++ /dev/null @@ -1,64 +0,0 @@ -import unittest -import numpy as np -import paddle - -from parakeet.modules import positional_encoding as pe - -def positional_encoding(start_index, length, size, dtype="float32"): - if (size % 2 != 0): - raise ValueError("size should be divisible by 2") - channel = np.arange(0, size, 2, dtype=dtype) - index = np.arange(start_index, start_index + length, 1, dtype=dtype) - p = np.expand_dims(index, -1) / (10000 ** (channel / float(size))) - encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1) - return encodings - -def scalable_positional_encoding(start_index, length, size, omega): - dtype = omega.dtype - index = np.arange(start_index, start_index + length, 1, dtype=dtype) - channel = np.arange(0, size, 2, dtype=dtype) - - p = np.reshape(omega, omega.shape + (1, 1)) \ - * np.expand_dims(index, -1) \ - / (10000 ** (channel / float(size))) - - encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1) - return encodings - -class TestPositionEncoding(unittest.TestCase): - def __init__(self, start=0, length=20, size=16, dtype="float64"): - super(TestPositionEncoding, self).__init__("runTest") - self.spec = (start, length, size, dtype) - - def test_equality(self): - start, length, size, dtype = self.spec - position_embed1 = positional_encoding(start, length, size, dtype) - position_embed2 = pe.positional_encoding(start, length, size, dtype) - np.testing.assert_allclose(position_embed2.numpy(), position_embed1) - - def runTest(self): - paddle.disable_static(paddle.CPUPlace()) - self.test_equality() - -class TestScalablePositionEncoding(unittest.TestCase): - def __init__(self, start=0, length=20, size=16, dtype="float64"): - super(TestScalablePositionEncoding, self).__init__("runTest") - self.spec = (start, length, size, dtype) - - def test_equality(self): - start, length, size, dtype = self.spec - omega = np.random.uniform(1, 2, size=(4,)).astype(dtype) - position_embed1 = scalable_positional_encoding(start, length, size, omega) - position_embed2 = pe.scalable_positional_encoding(start, length, size, paddle.to_tensor(omega)) - np.testing.assert_allclose(position_embed2.numpy(), position_embed1) - - def runTest(self): - paddle.disable_static(paddle.CPUPlace()) - self.test_equality() - - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - suite.addTest(TestPositionEncoding(0, 20, 16, "float64")) - suite.addTest(TestScalablePositionEncoding(0, 20, 16)) - return suite \ No newline at end of file diff --git a/tests/test_stft.py b/tests/test_stft.py deleted file mode 100644 index ac66d24..0000000 --- a/tests/test_stft.py +++ /dev/null @@ -1,27 +0,0 @@ -import unittest -import numpy as np -import librosa -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.modules import stft - -class TestSTFT(unittest.TestCase): - def test(self): - path = librosa.util.example("choice") - wav, sr = librosa.load(path, duration=5) - wav = wav.astype("float64") - - spec = librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024) - mag1 = np.abs(spec) - - wav_in_batch = paddle.unsqueeze(paddle.to_tensor(wav), 0) - mag2 = stft.STFT(2048, 256, 1024).magnitude(wav_in_batch) - mag2 = paddle.squeeze(mag2, [0, 2]).numpy() - - print("mag1", mag1) - print("mag2", mag2) - # TODO(chenfeiyu): Is there something wrong? there is some elements that - # does not match - # np.testing.assert_allclose(mag2, mag1) diff --git a/tests/test_transformer.py b/tests/test_transformer.py deleted file mode 100644 index 41b79bc..0000000 --- a/tests/test_transformer.py +++ /dev/null @@ -1,43 +0,0 @@ -import unittest -import numpy as np -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.modules import transformer - -class TestPositionwiseFFN(unittest.TestCase): - def test_io(self): - net = transformer.PositionwiseFFN(8, 12) - x = paddle.randn([2, 3, 4, 8]) - y = net(x) - self.assertTupleEqual(y.numpy().shape, (2, 3, 4, 8)) - - -class TestTransformerEncoderLayer(unittest.TestCase): - def test_io(self): - net = transformer.TransformerEncoderLayer(64, 8, 128, 0.5) - x = paddle.randn([4, 12, 64]) - lengths = paddle.to_tensor([12, 8, 9, 10]) - mask = paddle.fluid.layers.sequence_mask(lengths, dtype=x.dtype) - y, attn_weights = net(x, mask) - - self.assertTupleEqual(y.numpy().shape, (4, 12, 64)) - self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12)) - - -class TestTransformerDecoderLayer(unittest.TestCase): - def test_io(self): - net = transformer.TransformerDecoderLayer(64, 8, 128, 0.5) - q = paddle.randn([4, 32, 64]) - k = paddle.randn([4, 24, 64]) - v = paddle.randn([4, 24, 64]) - enc_lengths = paddle.to_tensor([24, 18, 20, 22]) - dec_lengths = paddle.to_tensor([32, 28, 30, 31]) - enc_mask = paddle.fluid.layers.sequence_mask(enc_lengths, dtype=k.dtype) - dec_mask = paddle.fluid.layers.sequence_mask(dec_lengths, dtype=q.dtype) - y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask) - - self.assertTupleEqual(y.numpy().shape, (4, 32, 64)) - self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32)) - self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24)) \ No newline at end of file diff --git a/tests/test_transformer_tts.py b/tests/test_transformer_tts.py deleted file mode 100644 index a13990d..0000000 --- a/tests/test_transformer_tts.py +++ /dev/null @@ -1,121 +0,0 @@ -import unittest -import numpy as np -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.models import transformer_tts as tts -from parakeet.modules import masking -from pprint import pprint - -class TestMultiheadAttention(unittest.TestCase): - def test_io_same_qk(self): - net = tts.MultiheadAttention(64, 8) - q = paddle.randn([4, 12, 64]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q - context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2) - self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64)) - self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12)) - - def test_io(self): - net = tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6) - q = paddle.randn([4, 12, 64]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q - context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2) - self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64)) - self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12)) - - -class TestTransformerEncoderLayer(unittest.TestCase): - def test_io(self): - net = tts.TransformerEncoderLayer(64, 8, 128) - x = paddle.randn([4, 12, 64]) - mask = paddle.fluid.layers.sequence_mask( - paddle.to_tensor([12, 10, 8, 9]), dtype=x.dtype) - context_vector, attention_weights = net(x, mask) - self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64)) - self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12)) - - -class TestTransformerDecoderLayer(unittest.TestCase): - def test_io(self): - net = tts.TransformerDecoderLayer(64, 8, 128, 0.5) - q = paddle.randn([4, 32, 64]) - k = paddle.randn([4, 24, 64]) - v = paddle.randn([4, 24, 64]) - enc_lengths = paddle.to_tensor([24, 18, 20, 22]) - dec_lengths = paddle.to_tensor([32, 28, 30, 31]) - enc_mask = masking.sequence_mask(enc_lengths, dtype=k.dtype) - dec_padding_mask = masking.sequence_mask(dec_lengths, dtype=q.dtype) - no_future_mask = masking.future_mask(32, dtype=q.dtype) - dec_mask = masking.combine_mask(dec_padding_mask.unsqueeze(-1), no_future_mask) - y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask) - - self.assertTupleEqual(y.numpy().shape, (4, 32, 64)) - self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32)) - self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24)) - - -class TestTransformerTTS(unittest.TestCase): - def setUp(self): - net = tts.TransformerTTS( - 128, 0, 64, 128, 80, 4, 128, - 6, 6, 128, 128, 4, - 3, 10, 0.1) - self.net = net - - def test_encode_io(self): - net = self.net - - text = paddle.randint(0, 128, [4, 176]) - lengths = paddle.to_tensor([176, 156, 174, 168]) - mask = masking.sequence_mask(lengths, dtype=text.dtype) - text = text * mask - - encoded, attention_weights, encoder_mask = net.encode(text) - print("output shapes:") - print("encoded:", encoded.numpy().shape) - print("encoder_attentions:", [item.shape for item in attention_weights]) - print("encoder_mask:", encoder_mask.numpy().shape) - - def test_all_io(self): - net = self.net - - text = paddle.randint(0, 128, [4, 176]) - lengths = paddle.to_tensor([176, 156, 174, 168]) - mask = masking.sequence_mask(lengths, dtype=text.dtype) - text = text * mask - - mel = paddle.randn([4, 189, 80]) - frames = paddle.to_tensor([189, 186, 179, 174]) - mask = masking.sequence_mask(frames, dtype=frames.dtype) - mel = mel * mask.unsqueeze(-1) - - encoded, encoder_attention_weights, encoder_mask = net.encode(text) - mel_output, mel_intermediate, cross_attention_weights, stop_logits = net.decode(encoded, mel, encoder_mask) - - print("output shapes:") - print("encoder_output:", encoded.numpy().shape) - print("encoder_attentions:", [item.shape for item in encoder_attention_weights]) - print("encoder_mask:", encoder_mask.numpy().shape) - print("mel_output: ", mel_output.numpy().shape) - print("mel_intermediate: ", mel_intermediate.numpy().shape) - print("decoder_attentions:", [item.shape for item in cross_attention_weights]) - print("stop_logits:", stop_logits.numpy().shape) - - def test_predict_io(self): - net = self.net - net.eval() - with paddle.no_grad(): - text = paddle.randint(0, 128, [176]) - decoder_output, encoder_attention_weights, cross_attention_weights = net.predict(text) - - print("output shapes:") - print("mel_output: ", decoder_output.numpy().shape) - print("encoder_attentions:", [item.shape for item in encoder_attention_weights]) - print("decoder_attentions:", [item.shape for item in cross_attention_weights]) - \ No newline at end of file diff --git a/tests/test_waveflow.py b/tests/test_waveflow.py deleted file mode 100644 index 15bbc44..0000000 --- a/tests/test_waveflow.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import unittest - -import paddle -paddle.set_default_dtype("float64") -paddle.disable_static(paddle.CPUPlace()) - -from parakeet.models import waveflow - -class TestFold(unittest.TestCase): - def test_audio(self): - x = paddle.randn([4, 32 * 8]) - y = waveflow.fold(x, 8) - self.assertTupleEqual(y.numpy().shape, (4, 32, 8)) - - def test_spec(self): - x = paddle.randn([4, 80, 32 * 8]) - y = waveflow.fold(x, 8) - self.assertTupleEqual(y.numpy().shape, (4, 80, 32, 8)) - - -class TestUpsampleNet(unittest.TestCase): - def test_io(self): - net = waveflow.UpsampleNet([2, 2]) - x = paddle.randn([4, 8, 6]) - y = net(x) - self.assertTupleEqual(y.numpy().shape, (4, 8, 2 * 2 * 6)) - - -class TestResidualBlock(unittest.TestCase): - def test_io(self): - net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2)) - x = paddle.randn([4, 4, 16, 32]) - condition = paddle.randn([4, 6, 16, 32]) - res, skip = net(x, condition) - self.assertTupleEqual(res.numpy().shape, (4, 4, 16, 32)) - self.assertTupleEqual(skip.numpy().shape, (4, 4, 16, 32)) - - def test_add_input(self): - net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2)) - net.eval() - net.start_sequence() - - x_row = paddle.randn([4, 4, 1, 32]) - condition_row = paddle.randn([4, 6, 1, 32]) - - res, skip = net.add_input(x_row, condition_row) - self.assertTupleEqual(res.numpy().shape, (4, 4, 1, 32)) - self.assertTupleEqual(skip.numpy().shape, (4, 4, 1, 32)) - - -class TestResidualNet(unittest.TestCase): - def test_io(self): - net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1]) - x = paddle.randn([4, 6, 8, 32]) - condition = paddle.randn([4, 8, 8, 32]) - y = net(x, condition) - self.assertTupleEqual(y.numpy().shape, (4, 6, 8, 32)) - - def test_add_input(self): - net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1]) - net.eval() - net.start_sequence() - - x_row = paddle.randn([4, 6, 1, 32]) - condition_row = paddle.randn([4, 8, 1, 32]) - - y_row = net.add_input(x_row, condition_row) - self.assertTupleEqual(y_row.numpy().shape, (4, 6, 1, 32)) - - -class TestFlow(unittest.TestCase): - def test_io(self): - net = waveflow.Flow(8, 16, 7, (3, 3), 8) - - x = paddle.randn([4, 1, 8, 32]) - condition = paddle.randn([4, 7, 8, 32]) - z, (logs, b) = net(x, condition) - self.assertTupleEqual(z.numpy().shape, (4, 1, 8, 32)) - self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32)) - self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32)) - - def test_inverse_row(self): - net = waveflow.Flow(8, 16, 7, (3, 3), 8) - net.eval() - net._start_sequence() - - x_row = paddle.randn([4, 1, 1, 32]) # last row - condition_row = paddle.randn([4, 7, 1, 32]) - z_row = paddle.randn([4, 1, 1, 32]) - x_next_row, (logs, b) = net._inverse_row(z_row, x_row, condition_row) - - self.assertTupleEqual(x_next_row.numpy().shape, (4, 1, 1, 32)) - self.assertTupleEqual(logs.numpy().shape, (4, 1, 1, 32)) - self.assertTupleEqual(b.numpy().shape, (4, 1, 1, 32)) - - def test_inverse(self): - net = waveflow.Flow(8, 16, 7, (3, 3), 8) - net.eval() - - z = paddle.randn([4, 1, 8, 32]) - condition = paddle.randn([4, 7, 8, 32]) - - with paddle.no_grad(): - x, (logs, b) = net.inverse(z, condition) - self.assertTupleEqual(x.numpy().shape, (4, 1, 8, 32)) - self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32)) - self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32)) - - -class TestWaveFlow(unittest.TestCase): - def test_io(self): - x = paddle.randn([4, 32 * 8 ]) - condition = paddle.randn([4, 7, 32 * 8]) - net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3)) - z, logs_det_jacobian = net(x, condition) - - self.assertTupleEqual(z.numpy().shape, (4, 32 * 8)) - self.assertTupleEqual(logs_det_jacobian.numpy().shape, (1,)) - - def test_inverse(self): - z = paddle.randn([4, 32 * 8 ]) - condition = paddle.randn([4, 7, 32 * 8]) - - net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3)) - net.eval() - - with paddle.no_grad(): - x = net.inverse(z, condition) - self.assertTupleEqual(x.numpy().shape, (4, 32 * 8)) From e03e96d9e4c2b91eb3f4128b1c3a241bc08b72d3 Mon Sep 17 00:00:00 2001 From: iclementine Date: Sun, 20 Dec 2020 13:15:07 +0800 Subject: [PATCH 2/3] format all the code with yapf --- README_cn.md | 2 +- doc/source/conf.py | 19 +- docs/config_cn.md | 10 +- docs/data_cn.md | 14 +- docs/experiment_cn.md | 2 +- docs/experiment_guide_cn.md | 2 - docs/installation_cn.md | 2 +- docs/overview_cn.md | 7 - examples/transformer_tts/config.py | 56 ++-- examples/transformer_tts/ljspeech.py | 37 ++- examples/transformer_tts/preprocess.py | 50 +++- examples/transformer_tts/synthesize.py | 55 +++- examples/transformer_tts/train.py | 74 +++--- examples/waveflow/config.py | 58 +++-- examples/waveflow/ljspeech.py | 36 ++- examples/waveflow/preprocess.py | 52 ++-- examples/waveflow/synthesize.py | 54 +++- examples/waveflow/train.py | 48 +++- examples/wavenet/config.py | 55 ++-- examples/wavenet/ljspeech.py | 37 ++- examples/wavenet/preprocess.py | 54 ++-- examples/wavenet/synthesize.py | 53 +++- examples/wavenet/train.py | 55 ++-- parakeet/audio/audio.py | 34 +-- parakeet/audio/spec_normalizer.py | 26 +- parakeet/data/batch.py | 30 ++- parakeet/data/dataset.py | 14 +- parakeet/datasets/__init__.py | 14 + parakeet/datasets/common.py | 17 +- parakeet/datasets/ljspeech.py | 16 +- parakeet/frontend/__init__.py | 14 + parakeet/frontend/normalizer/__init__.py | 14 + parakeet/frontend/normalizer/abbrrviation.py | 14 + parakeet/frontend/normalizer/acronyms.py | 14 + parakeet/frontend/normalizer/width.py | 25 +- parakeet/frontend/punctuation.py | 26 +- parakeet/models/transformer_tts.py | 7 +- parakeet/models/waveflow.py | 257 +++++++++++-------- parakeet/models/wavenet.py | 135 +++++----- parakeet/modules/audio.py | 20 +- parakeet/modules/conv.py | 2 + parakeet/modules/geometry.py | 17 +- parakeet/modules/losses.py | 36 ++- parakeet/modules/masking.py | 15 ++ parakeet/modules/positional_encoding.py | 17 +- parakeet/modules/transformer.py | 74 +++--- parakeet/training/__init__.py | 14 + parakeet/training/cli.py | 17 +- parakeet/training/default_config.py | 26 +- parakeet/training/experiment.py | 1 + parakeet/utils/checkpoint.py | 15 +- parakeet/utils/internals.py | 14 + parakeet/utils/layer_tools.py | 4 + parakeet/utils/mp_tools.py | 21 +- parakeet/utils/scheduler.py | 25 +- setup.py | 14 +- 56 files changed, 1252 insertions(+), 569 deletions(-) diff --git a/README_cn.md b/README_cn.md index 994a4e2..ce88032 100644 --- a/README_cn.md +++ b/README_cn.md @@ -228,6 +228,6 @@ Parakeet 同时提供了示例模型的训练好的参数,可从下表中获 正在开发中。 -## 版权和许可 +## 版权和许可 Parakeet 以 [Apache-2.0 license](LICENSE) 提供。 diff --git a/doc/source/conf.py b/doc/source/conf.py index f7d0af2..dd4a270 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -14,7 +28,6 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) - # -- Project information ----------------------------------------------------- project = 'parakeet' @@ -24,7 +37,6 @@ author = 'parakeet-developers' # The full version, including alpha/beta/rc tags release = '0.2' - # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be @@ -33,7 +45,7 @@ release = '0.2' extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', - "sphinx_rtd_theme", + "sphinx_rtd_theme", 'sphinx.ext.mathjax', 'numpydoc', ] @@ -46,7 +58,6 @@ templates_path = ['_templates'] # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/config_cn.md b/docs/config_cn.md index 2b8ce4c..29a80c6 100644 --- a/docs/config_cn.md +++ b/docs/config_cn.md @@ -18,7 +18,7 @@ 常见的配置文件的格式有 `ini`, `yaml`, `toml`, `json` 等。 -`ini` +`ini` 优点:简单,支持字符串插值等操作。 缺点:仅支持两层结构,值不带类型信息,解析的时候需要手动 cast。 @@ -102,11 +102,3 @@ optional arguments: --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs ``` - - - - - - - - diff --git a/docs/data_cn.md b/docs/data_cn.md index 4a7aab8..6ef6404 100644 --- a/docs/data_cn.md +++ b/docs/data_cn.md @@ -21,7 +21,7 @@ 一般来说,我们将一个 Dataset 的子类看作是数据集和实验的具体需求之间的适配器。 -parakeet 还提供了若干个高阶的 Dataset 类,用于从已有的 Dataset 产生新的 Dataset. +parakeet 还提供了若干个高阶的 Dataset 类,用于从已有的 Dataset 产生新的 Dataset. 1. 用于字段组合的有 TupleDataset, DictDataset; 2. 用于数据集切分合并的有 SliceDataset, SubsetDataset, ChainDataset; @@ -137,7 +137,7 @@ class Transform(object): self.processor = AudioProcessor( sample_rate=22050, n_fft=1024, - win_length=1024, + win_length=1024, hop_length=256, f_max=8000) self.normalizer = LogMagnitude() @@ -167,7 +167,7 @@ ljspeech = TransformDataset(meta, transform) 当然也可以选择专门写一个转换脚本把转换后的数据集保存下来,然后再写一个适配的 Dataset 子类去加载这些保存的数据。实际这么做的效率会更高。 -接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据,所以我们需要进行 padding. +接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据,所以我们需要进行 padding. ```python class LJSpeechCollector(object): @@ -197,10 +197,10 @@ def create_dataloader(source_path, valid_size, batch_size): valid_set, train_set = dataset.split(lj, valid_size) train_loader = DataLoader( - train_set, - return_list=False, - batch_size=batch_size, - shuffle=True, + train_set, + return_list=False, + batch_size=batch_size, + shuffle=True, drop_last=True, collate_fn=LJSpeechCollector()) valid_loader = DataLoader( diff --git a/docs/experiment_cn.md b/docs/experiment_cn.md index dc6a997..0596dda 100644 --- a/docs/experiment_cn.md +++ b/docs/experiment_cn.md @@ -72,4 +72,4 @@ def train(self): ```python exp.run() -``` \ No newline at end of file +``` diff --git a/docs/experiment_guide_cn.md b/docs/experiment_guide_cn.md index c5cc82e..8c9b89d 100644 --- a/docs/experiment_guide_cn.md +++ b/docs/experiment_guide_cn.md @@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+ ``` 在这个软件源中包含了几个例子,可以在 [Parakeet/examples](../examples) 中查看。这些实验被作为样例提供给用户,可以直接运行。同时也欢迎用户添加新的模型和实验并为 `Parakeet` 贡献代码。 - - diff --git a/docs/installation_cn.md b/docs/installation_cn.md index a861c86..030b721 100644 --- a/docs/installation_cn.md +++ b/docs/installation_cn.md @@ -31,7 +31,7 @@ python -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/si # ubuntu, debian sudo apt-get install libsndfile1 -# centos, fedora, +# centos, fedora, sudo yum install libsndfile # openSUSE diff --git a/docs/overview_cn.md b/docs/overview_cn.md index 40659af..06a9f93 100644 --- a/docs/overview_cn.md +++ b/docs/overview_cn.md @@ -9,10 +9,3 @@ Parakeet 为用户和开发者提供了 1. 可复用的模型以及常用的模块; 2. 从数据处理,模型训练到预测等一系列过程的完整实验; 3. 高质量的开箱即用模型。 - - - - - - - diff --git a/examples/transformer_tts/config.py b/examples/transformer_tts/config.py index fef9ed8..bcf8e90 100644 --- a/examples/transformer_tts/config.py +++ b/examples/transformer_tts/config.py @@ -1,21 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=16, # batch size - valid_size=64, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size + batch_size=16, # batch size + valid_size=64, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=1024, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame - f_max=8000, # Hz, max frequency when converting to mel + f_max=8000, # Hz, max frequency when converting to mel d_mel=80, # mel bands - padding_idx=0, # text embedding's padding index - mel_start_value=0.5, # value for starting frame - mel_end_value=-0.5, # # value for ending frame - ) -) + padding_idx=0, # text embedding's padding index + mel_start_value=0.5, # value for starting frame + mel_end_value=-0.5, # # value for ending frame + )) _C.model = CN( dict( @@ -31,22 +44,21 @@ _C.model = CN( postnet_kernel_size=5, # decoder postnet(cnn)'s kernel size max_reduction_factor=10, # max_reduction factor dropout=0.1, # global droput probability - stop_loss_scale=8.0, # scaler for stop _loss - decoder_prenet_dropout=0.5, # decoder prenet dropout probability - ) -) + stop_loss_scale=8.0, # scaler for stop _loss + decoder_prenet_dropout=0.5, # decoder prenet dropout probability + )) _C.training = CN( dict( - lr=1e-4, # learning rate + lr=1e-4, # learning rate drop_n_heads=[[0, 0], [15000, 1]], reduction_factor=[[0, 10], [80000, 4], [200000, 2]], - plot_interval=1000, # plot attention and spectrogram - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=900000, # max iteration to train - ) -) + plot_interval=1000, # plot attention and spectrogram + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=900000, # max iteration to train + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/transformer_tts/ljspeech.py b/examples/transformer_tts/ljspeech.py index 245b475..137db96 100644 --- a/examples/transformer_tts/ljspeech.py +++ b/examples/transformer_tts/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader from parakeet.data.batch import batch_spec, batch_text_id from parakeet.data import dataset + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() records = [] @@ -35,13 +51,13 @@ class Transform(object): self.end_value = end_value def __call__(self, example): - ids, mel = example # ids already have and + ids, mel = example # ids already have and ids = np.array(ids, dtype=np.int64) # add start and end frame - mel = np.pad(mel, - [(0, 0), (1, 1)], - mode='constant', - constant_values=[(0, 0), (self.start_value, self.end_value)]) + mel = np.pad( + mel, [(0, 0), (1, 1)], + mode='constant', + constant_values=[(0, 0), (self.start_value, self.end_value)]) stop_labels = np.ones([mel.shape[1]], dtype=np.int64) stop_labels[-1] = 2 # actually this thing can also be done within the model @@ -50,6 +66,7 @@ class Transform(object): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_idx=0, padding_value=0.): self.padding_idx = padding_idx self.padding_value = padding_value @@ -67,15 +84,16 @@ class LJSpeechCollector(object): def create_dataloader(config, source_path): lj = LJSpeech(source_path) - transform = Transform(config.data.mel_start_value, config.data.mel_end_value) + transform = Transform(config.data.mel_start_value, + config.data.mel_end_value) lj = dataset.TransformDataset(lj, transform) valid_set, train_set = dataset.split(lj, config.data.valid_size) data_collator = LJSpeechCollector(padding_idx=config.data.padding_idx) train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=data_collator) valid_loader = DataLoader( @@ -85,4 +103,3 @@ def create_dataloader(config, source_path): drop_last=False, collate_fn=data_collator) return train_loader, valid_loader - diff --git a/examples/transformer_tts/preprocess.py b/examples/transformer_tts/preprocess.py index 001f04c..2ba1985 100644 --- a/examples/transformer_tts/preprocess.py +++ b/examples/transformer_tts/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import pickle @@ -11,6 +25,7 @@ from parakeet.frontend import English from config import get_cfg_defaults + def create_dataset(config, source_path, target_path, verbose=False): # create output dir target_path = Path(target_path).expanduser() @@ -23,11 +38,11 @@ def create_dataset(config, source_path, target_path, verbose=False): sample_rate=config.data.sample_rate, n_fft=config.data.n_fft, n_mels=config.data.d_mel, - win_length=config.data.win_length, + win_length=config.data.win_length, hop_length=config.data.hop_length, f_max=config.data.f_max) normalizer = LogMagnitude() - + records = [] for (fname, text, _) in tqdm.tqdm(meta_data): wav = processor.read_wav(fname) @@ -42,12 +57,13 @@ def create_dataset(config, source_path, target_path, verbose=False): np.save(mel_path / mel_name, mel) if verbose: print("save mel spectrograms into {}".format(mel_path)) - + # save meta data as pickle archive with open(target_path / "metadata.pkl", 'wb') as f: pickle.dump(records, f) if verbose: - print("saved metadata into {}".format(target_path / "metadata.pkl")) + print("saved metadata into {}".format(target_path / + "metadata.pkl")) # also save meta data into text format for inspection with open(target_path / "metadata.txt", 'wt') as f: @@ -55,21 +71,31 @@ def create_dataset(config, source_path, target_path, verbose=False): phoneme_str = "|".join(phonemes) f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str)) if verbose: - print("saved metadata into {}".format(target_path / "metadata.txt")) - + print("saved metadata into {}".format(target_path / + "metadata.txt")) + print("Done.") if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/transformer_tts/synthesize.py b/examples/transformer_tts/synthesize.py index b8f352f..6758819 100644 --- a/examples/transformer_tts/synthesize.py +++ b/examples/transformer_tts/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import time from pathlib import Path @@ -13,21 +27,22 @@ from parakeet.utils.display import add_attention_plots from config import get_cfg_defaults + @paddle.fluid.dygraph.no_grad def main(config, args): paddle.set_device(args.device) # model frontend = English() - model = TransformerTTS.from_pretrained( - frontend, config, args.checkpoint_path) + model = TransformerTTS.from_pretrained(frontend, config, + args.checkpoint_path) model.eval() # inputs input_path = Path(args.input).expanduser() - with open(input_path, "rt") as f: + with open(input_path, "rt") as f: sentences = f.readlines() - + output_dir = Path(args.output).expanduser() output_dir.mkdir(parents=True, exist_ok=True) @@ -38,22 +53,36 @@ def main(config, args): mel_output = mel_output.T #(C, T) np.save(str(output_dir / f"sentence_{i}"), mel_output) if args.verbose: - print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy")) + print("spectrogram saved at {}".format(output_dir / + f"sentence_{i}.npy")) + if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") parser.add_argument("--input", type=str, help="path of the text sentences") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/transformer_tts/train.py b/examples/transformer_tts/train.py index 59ec7aa..b5ae11d 100644 --- a/examples/transformer_tts/train.py +++ b/examples/transformer_tts/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time import logging from pathlib import Path @@ -19,12 +33,13 @@ from parakeet.training.experiment import ExperimentBase from config import get_cfg_defaults from ljspeech import LJSpeech, LJSpeechCollector, Transform + class Experiment(ExperimentBase): def setup_model(self): config = self.config frontend = English() model = TransformerTTS( - frontend, + frontend, d_encoder=config.model.d_encoder, d_decoder=config.model.d_decoder, d_mel=config.data.d_mel, @@ -46,8 +61,7 @@ class Experiment(ExperimentBase): beta1=0.9, beta2=0.98, epsilon=1e-9, - parameters=model.parameters() - ) + parameters=model.parameters()) criterion = TransformerTTSLoss(config.model.stop_loss_scale) drop_n_heads = scheduler.StepWise(config.training.drop_n_heads) reduction_factor = scheduler.StepWise(config.training.reduction_factor) @@ -63,21 +77,24 @@ class Experiment(ExperimentBase): config = self.config ljspeech_dataset = LJSpeech(args.data) - transform = Transform(config.data.mel_start_value, config.data.mel_end_value) - ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + transform = Transform(config.data.mel_start_value, + config.data.mel_end_value) + ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, + transform) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx) - + if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, num_replicas=dist.get_world_size(), rank=dist.get_rank(), @@ -95,11 +112,11 @@ class Experiment(ExperimentBase): def compute_outputs(self, text, mel, stop_label): model_core = self.model._layers if self.parallel else self.model model_core.set_constants( - self.reduction_factor(self.iteration), + self.reduction_factor(self.iteration), self.drop_n_heads(self.iteration)) # TODO(chenfeiyu): we can combine these 2 slices - mel_input = mel[:,:-1, :] + mel_input = mel[:, :-1, :] reduced_mel_input = mel_input[:, ::model_core.r, :] outputs = self.model(text, reduced_mel_input) return outputs @@ -115,11 +132,8 @@ class Experiment(ExperimentBase): time_steps = mel_target.shape[1] losses = self.criterion( - mel_output[:,:time_steps, :], - mel_intermediate[:,:time_steps, :], - mel_target, - stop_logits[:,:time_steps, :], - stop_label_target) + mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :], + mel_target, stop_logits[:, :time_steps, :], stop_label_target) return losses def train_batch(self): @@ -133,7 +147,7 @@ class Experiment(ExperimentBase): outputs = self.compute_outputs(text, mel, stop_label) losses = self.compute_losses(batch, outputs) loss = losses["loss"] - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start @@ -141,14 +155,17 @@ class Experiment(ExperimentBase): # logging msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) - msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) self.logger.info(msg) - + if dist.get_rank() == 0: for k, v in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration) - + self.visualizer.add_scalar(f"train_loss/{k}", v, + self.iteration) + @mp_tools.rank_zero_only @paddle.no_grad() def valid(self): @@ -163,10 +180,9 @@ class Experiment(ExperimentBase): if i < 2: attention_weights = outputs["cross_attention_weights"] display.add_multi_attention_plots( - self.visualizer, - f"valid_sentence_{i}_cross_attention_weights", - attention_weights, - self.iteration) + self.visualizer, + f"valid_sentence_{i}_cross_attention_weights", + attention_weights, self.iteration) # write visual log valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} @@ -191,7 +207,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/waveflow/config.py b/examples/waveflow/config.py index 97a877a..5ca2ba1 100644 --- a/examples/waveflow/config.py +++ b/examples/waveflow/config.py @@ -1,40 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=8, # batch size - valid_size=16, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size + batch_size=8, # batch size + valid_size=16, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=1024, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame - f_max=8000, # Hz, max frequency when converting to mel + f_max=8000, # Hz, max frequency when converting to mel n_mels=80, # mel bands - clip_frames=65, # mel clip frames - ) -) + clip_frames=65, # mel clip frames + )) _C.model = CN( dict( upsample_factors=[16, 16], - n_flows=8, # number of flows in WaveFlow - n_layers=8, # number of conv block in each flow - n_group=16, # folding factor of audio and spectrogram - channels=128, # resiaudal channel in each flow - kernel_size=[3, 3], # kernel size in each conv block - sigma=1.0, # stddev of the random noise - ) -) + n_flows=8, # number of flows in WaveFlow + n_layers=8, # number of conv block in each flow + n_group=16, # folding factor of audio and spectrogram + channels=128, # resiaudal channel in each flow + kernel_size=[3, 3], # kernel size in each conv block + sigma=1.0, # stddev of the random noise + )) _C.training = CN( dict( - lr=2e-4, # learning rates - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=3000000, # max iteration to train - ) -) + lr=2e-4, # learning rates + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=3000000, # max iteration to train + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/waveflow/ljspeech.py b/examples/waveflow/ljspeech.py index d7f5425..e07303a 100644 --- a/examples/waveflow/ljspeech.py +++ b/examples/waveflow/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav from parakeet.data import dataset from parakeet.audio import AudioProcessor + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() meta_data = pandas.read_csv( str(self.root / "metadata.csv"), sep="\t", header=None, - names=["fname", "frames", "samples"] - ) - + names=["fname", "frames", "samples"]) + records = [] - for row in meta_data.itertuples() : + for row in meta_data.itertuples(): mel_path = str(self.root / "mel" / (row.fname + ".npy")) wav_path = str(self.root / "wav" / (row.fname + ".npy")) records.append((mel_path, wav_path)) @@ -39,6 +54,7 @@ class LJSpeech(Dataset): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_value=0.): self.padding_value = padding_value @@ -52,9 +68,9 @@ class LJSpeechCollector(object): class LJSpeechClipCollector(object): def __init__(self, clip_frames=65, hop_length=256): - self.clip_frames = clip_frames + self.clip_frames = clip_frames self.hop_length = hop_length - + def __call__(self, examples): mels = [] wavs = [] @@ -70,9 +86,7 @@ class LJSpeechClipCollector(object): mel, wav = example frames = mel.shape[-1] start = np.random.randint(0, frames - self.clip_frames) - mel_clip = mel[:, start: start + self.clip_frames] - wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length] + mel_clip = mel[:, start:start + self.clip_frames] + wav_clip = wav[start * self.hop_length:(start + self.clip_frames) * + self.hop_length] return mel_clip, wav_clip - - - diff --git a/examples/waveflow/preprocess.py b/examples/waveflow/preprocess.py index d4bdc8e..ac6d62e 100644 --- a/examples/waveflow/preprocess.py +++ b/examples/waveflow/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import csv @@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True): output_dir = Path(output_dir).expanduser() output_dir.mkdir(exist_ok=True) - transform = Transform( - config.sample_rate, - config.n_fft, - config.win_length, - config.hop_length, - config.n_mels) + transform = Transform(config.sample_rate, config.n_fft, config.win_length, + config.hop_length, config.n_mels) file_names = [] for example in tqdm.tqdm(dataset): @@ -107,23 +117,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True): np.save(str(mel_dir / base_name), mel) file_names.append((base_name, mel.shape[-1], audio.shape[-1])) - + meta_data = pd.DataFrame.from_records(file_names) - meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) - print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv"))) + meta_data.to_csv( + str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) + print("saved meta data in to {}".format( + os.path.join(output_dir, "metadata.csv"))) print("Done!") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/waveflow/synthesize.py b/examples/waveflow/synthesize.py index 1856eb2..45c751a 100644 --- a/examples/waveflow/synthesize.py +++ b/examples/waveflow/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import numpy as np import soundfile as sf @@ -8,9 +22,9 @@ import parakeet from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow from parakeet.utils import layer_tools, checkpoint - from config import get_cfg_defaults + def main(config, args): paddle.set_device(args.device) model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) @@ -23,7 +37,8 @@ def main(config, args): for file_path in mel_dir.iterdir(): mel = np.load(str(file_path)) audio = model.predict(mel) - audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") + audio_path = output_dir / ( + os.path.splitext(file_path.name)[0] + ".wav") sf.write(audio_path, audio, config.data.sample_rate) print("[synthesize] {} -> {}".format(file_path, audio_path)) @@ -31,17 +46,32 @@ def main(config, args): if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") - parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser.add_argument( + "--input", + type=str, + help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) @@ -49,4 +79,4 @@ if __name__ == "__main__": print(config) print(args) - main(config, args) \ No newline at end of file + main(config, args) diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py index 1cd68f0..443cc8b 100644 --- a/examples/waveflow/train.py +++ b/examples/waveflow/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from pathlib import Path import numpy as np @@ -34,7 +48,8 @@ class Experiment(ExperimentBase): if self.parallel > 1: model = paddle.DataParallel(model) - optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters()) + optimizer = paddle.optimizer.Adam( + config.training.lr, parameters=model.parameters()) criterion = WaveFlowLoss(sigma=config.model.sigma) self.model = model @@ -46,20 +61,22 @@ class Experiment(ExperimentBase): args = self.args ljspeech_dataset = LJSpeech(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) + + batch_fn = LJSpeechClipCollector(config.data.clip_frames, + config.data.hop_length) - batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length) - if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, num_replicas=dist.get_world_size(), rank=dist.get_rank(), @@ -71,7 +88,7 @@ class Experiment(ExperimentBase): valid_batch_fn = LJSpeechCollector() valid_loader = DataLoader( valid_set, batch_size=1, collate_fn=valid_batch_fn) - + self.train_loader = train_loader self.valid_loader = valid_loader @@ -90,17 +107,19 @@ class Experiment(ExperimentBase): mel, wav = batch z, log_det_jocobian = self.compute_outputs(mel, wav) loss = self.criterion(z, log_det_jocobian) - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start loss_value = float(loss) msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) msg += "loss: {:>.6f}".format(loss_value) self.logger.info(msg) - self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration) + self.visualizer.add_scalar( + "train/loss", loss_value, global_step=self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() @@ -112,7 +131,8 @@ class Experiment(ExperimentBase): loss = self.criterion(z, log_det_jocobian) valid_losses.append(float(loss)) valid_loss = np.mean(valid_losses) - self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration) + self.visualizer.add_scalar( + "valid/loss", valid_loss, global_step=self.iteration) def main_sp(config, args): @@ -132,7 +152,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/wavenet/config.py b/examples/wavenet/config.py index 58f9beb..658d416 100644 --- a/examples/wavenet/config.py +++ b/examples/wavenet/config.py @@ -1,19 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=8, # batch size - valid_size=16, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=2048, # fft frame size - win_length=1024, # window size + batch_size=8, # batch size + valid_size=16, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=2048, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame # f_max=8000, # Hz, max frequency when converting to mel n_mels=80, # mel bands - train_clip_seconds=0.5, # audio clip length(in seconds) - ) -) + train_clip_seconds=0.5, # audio clip length(in seconds) + )) _C.model = CN( dict( @@ -21,24 +34,22 @@ _C.model = CN( n_stack=3, n_loop=10, filter_size=2, - residual_channels=128, # resiaudal channel in each flow + residual_channels=128, # resiaudal channel in each flow loss_type="mog", - output_dim=3, # single gaussian - log_scale_min=-9.0, - ) -) + output_dim=3, # single gaussian + log_scale_min=-9.0, )) _C.training = CN( dict( - lr=1e-3, # learning rates - anneal_rate=0.5, # learning rate decay rate - anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=3000000, # max iteration to train - gradient_max_norm=100.0 # global norm of gradients - ) -) + lr=1e-3, # learning rates + anneal_rate=0.5, # learning rate decay rate + anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=3000000, # max iteration to train + gradient_max_norm=100.0 # global norm of gradients + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/wavenet/ljspeech.py b/examples/wavenet/ljspeech.py index 18dc388..d1d3c67 100644 --- a/examples/wavenet/ljspeech.py +++ b/examples/wavenet/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav from parakeet.data import dataset from parakeet.audio import AudioProcessor + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() meta_data = pandas.read_csv( str(self.root / "metadata.csv"), sep="\t", header=None, - names=["fname", "frames", "samples"] - ) - + names=["fname", "frames", "samples"]) + records = [] - for row in meta_data.itertuples() : + for row in meta_data.itertuples(): mel_path = str(self.root / "mel" / (row.fname + ".npy")) wav_path = str(self.root / "wav" / (row.fname + ".npy")) records.append((mel_path, wav_path)) @@ -39,6 +54,7 @@ class LJSpeech(Dataset): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_value=0.): self.padding_value = padding_value @@ -48,15 +64,15 @@ class LJSpeechCollector(object): wavs = [example[1] for example in examples] mels = batch_spec(mels, pad_value=self.padding_value) wavs = batch_wav(wavs, pad_value=self.padding_value) - audio_starts = np.zeros((batch_size,), dtype=np.int64) + audio_starts = np.zeros((batch_size, ), dtype=np.int64) return mels, wavs, audio_starts class LJSpeechClipCollector(object): def __init__(self, clip_frames=65, hop_length=256): - self.clip_frames = clip_frames + self.clip_frames = clip_frames self.hop_length = hop_length - + def __call__(self, examples): mels = [] wavs = [] @@ -75,7 +91,8 @@ class LJSpeechClipCollector(object): mel, wav = example frames = mel.shape[-1] start = np.random.randint(0, frames - self.clip_frames) - wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length] + wav_clip = wav[start * self.hop_length:(start + self.clip_frames) * + self.hop_length] return mel, wav_clip, start @@ -132,7 +149,3 @@ class DataCollector(object): audios = np.array(audios, dtype=np.float32) audio_starts = np.array(audio_starts, dtype=np.int64) return audios, mels, audio_starts - - - - diff --git a/examples/wavenet/preprocess.py b/examples/wavenet/preprocess.py index 29b140c..cc83727 100644 --- a/examples/wavenet/preprocess.py +++ b/examples/wavenet/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import csv @@ -23,7 +37,7 @@ class Transform(object): self.win_length = win_length self.hop_length = hop_length self.n_mels = n_mels - + self.spec_normalizer = UnitMagnitude(min=1e-5) def __call__(self, example): @@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True): output_dir = Path(output_dir).expanduser() output_dir.mkdir(exist_ok=True) - transform = Transform( - config.sample_rate, - config.n_fft, - config.win_length, - config.hop_length, - config.n_mels) + transform = Transform(config.sample_rate, config.n_fft, config.win_length, + config.hop_length, config.n_mels) file_names = [] for example in tqdm.tqdm(dataset): @@ -108,23 +118,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True): np.save(str(mel_dir / base_name), mel) file_names.append((base_name, mel.shape[-1], audio.shape[-1])) - + meta_data = pd.DataFrame.from_records(file_names) - meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) - print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv"))) + meta_data.to_csv( + str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) + print("saved meta data in to {}".format( + os.path.join(output_dir, "metadata.csv"))) print("Done!") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/wavenet/synthesize.py b/examples/wavenet/synthesize.py index 80b96a2..c5a69fe 100644 --- a/examples/wavenet/synthesize.py +++ b/examples/wavenet/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import numpy as np import soundfile as sf @@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint from config import get_cfg_defaults + def main(config, args): paddle.set_device(args.device) model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path) @@ -22,7 +37,8 @@ def main(config, args): for file_path in mel_dir.iterdir(): mel = np.load(str(file_path)) audio = model.predict(mel) - audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") + audio_path = output_dir / ( + os.path.splitext(file_path.name)[0] + ".wav") sf.write(audio_path, audio, config.data.sample_rate) print("[synthesize] {} -> {}".format(file_path, audio_path)) @@ -30,17 +46,32 @@ def main(config, args): if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") - parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser.add_argument( + "--input", + type=str, + help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) @@ -48,4 +79,4 @@ if __name__ == "__main__": print(config) print(args) - main(config, args) \ No newline at end of file + main(config, args) diff --git a/examples/wavenet/train.py b/examples/wavenet/train.py index 77c54e3..8e9bc0e 100644 --- a/examples/wavenet/train.py +++ b/examples/wavenet/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from pathlib import Path import math @@ -26,7 +40,7 @@ class Experiment(ExperimentBase): config = self.config model = ConditionalWaveNet( upsample_factors=config.model.upsample_factors, - n_stack=config.model.n_stack, + n_stack=config.model.n_stack, n_loop=config.model.n_loop, residual_channels=config.model.residual_channels, output_dim=config.model.output_dim, @@ -39,13 +53,13 @@ class Experiment(ExperimentBase): model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.StepDecay( - config.training.lr, - config.training.anneal_interval, + config.training.lr, config.training.anneal_interval, config.training.anneal_rate) optimizer = paddle.optimizer.Adam( lr_scheduler, parameters=model.parameters(), - grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm)) + grad_clip=paddle.nn.ClipGradByGlobalNorm( + config.training.gradient_max_norm)) self.model = model self.model_core = model._layer if self.parallel else model @@ -56,7 +70,8 @@ class Experiment(ExperimentBase): args = self.args ljspeech_dataset = LJSpeech(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) # convolutional net's causal padding size context_size = config.model.n_stack \ @@ -66,20 +81,21 @@ class Experiment(ExperimentBase): # frames used to compute loss frames_per_second = config.data.sample_rate // config.data.hop_length - train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second) - + train_clip_frames = math.ceil(config.data.train_clip_seconds * + frames_per_second) + num_frames = train_clip_frames + context_frames batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length) if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, shuffle=True, drop_last=True) @@ -89,7 +105,7 @@ class Experiment(ExperimentBase): valid_batch_fn = LJSpeechCollector() valid_loader = DataLoader( valid_set, batch_size=1, collate_fn=valid_batch_fn) - + self.train_loader = train_loader self.valid_loader = valid_loader @@ -101,20 +117,22 @@ class Experiment(ExperimentBase): self.model.train() self.optimizer.clear_grad() mel, wav, audio_starts = batch - + y = self.model(wav, mel, audio_starts) loss = self.model.loss(y, wav) - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start loss_value = float(loss) msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) msg += "loss: {:>.6f}".format(loss_value) self.logger.info(msg) - self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration) + self.visualizer.add_scalar( + "train/loss", loss_value, global_step=self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() @@ -126,7 +144,8 @@ class Experiment(ExperimentBase): loss = self.model.loss(y, wav) valid_losses.append(float(loss)) valid_loss = np.mean(valid_losses) - self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration) + self.visualizer.add_scalar( + "valid/loss", valid_loss, global_step=self.iteration) def main_sp(config, args): @@ -146,7 +165,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index 93d4e6b..3795111 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -18,15 +18,16 @@ import numpy as np __all__ = ["AudioProcessor"] + class AudioProcessor(object): def __init__(self, - sample_rate:int, - n_fft:int, - win_length:int, - hop_length:int, - n_mels:int=80, - f_min:int=0, - f_max:int=None, + sample_rate: int, + n_fft: int, + win_length: int, + hop_length: int, + n_mels: int=80, + f_min: int=0, + f_max: int=None, window="hann", center=True, pad_mode="reflect"): @@ -40,7 +41,7 @@ class AudioProcessor(object): self.window = window self.center = center self.pad_mode = pad_mode - + # mel self.n_mels = n_mels self.f_min = f_min @@ -48,19 +49,18 @@ class AudioProcessor(object): self.mel_filter = self._create_mel_filter() self.inv_mel_filter = np.linalg.pinv(self.mel_filter) - + def _create_mel_filter(self): - mel_filter = librosa.filters.mel( - self.sample_rate, - self.n_fft, - n_mels=self.n_mels, - fmin=self.f_min, - fmax=self.f_max) + mel_filter = librosa.filters.mel(self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.f_min, + fmax=self.f_max) return mel_filter def read_wav(self, filename): # resampling may occur - wav, _ = librosa.load(filename, sr=self.sample_rate) + wav, _ = librosa.load(filename, sr=self.sample_rate) return wav def write_wav(self, path, wav): @@ -69,7 +69,7 @@ class AudioProcessor(object): def stft(self, wav): D = librosa.core.stft( wav, - n_fft = self.n_fft, + n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, diff --git a/parakeet/audio/spec_normalizer.py b/parakeet/audio/spec_normalizer.py index 08cea1b..069c453 100644 --- a/parakeet/audio/spec_normalizer.py +++ b/parakeet/audio/spec_normalizer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This modules contains normalizers for spectrogram magnitude. @@ -19,22 +32,24 @@ __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"] class NormalizerBase(object): def transform(self, spec): raise NotImplementedError("transform must be implemented") - + def inverse(self, normalized): raise NotImplementedError("inverse must be implemented") + class LogMagnitude(NormalizerBase): """ This is a simple normalizer used in Waveglow, Waveflow, tacotron2... """ + def __init__(self, min=1e-7): self.min = min - + def transform(self, x): x = np.maximum(x, self.min) x = np.log(x) return x - + def inverse(self, x): return np.exp(x) @@ -44,15 +59,16 @@ class UnitMagnitude(NormalizerBase): """ This is the normalizer used in the """ + def __init__(self, min=1e-5): self.min = min - + def transform(self, x): db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 normalized = (db_scale + 100) / 100 clipped = np.clip(normalized, 0, 1) return clipped - + def inverse(self, x): denormalized = np.clip(x, 0, 1) * 100 - 100 out = np.exp((denormalized + 20) / 20 * np.log(10)) diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 1551124..4c5be61 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided. import numpy as np __all__ = [ - "batch_text_id", "batch_wav", "batch_spec", - "TextIDBatcher", "WavBatcher", "SpecBatcher", + "batch_text_id", + "batch_wav", + "batch_spec", + "TextIDBatcher", + "WavBatcher", + "SpecBatcher", ] + class TextIDBatcher(object): """A wrapper class for `batch_text_id`.""" @@ -99,8 +104,8 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): pad_len = max_len - example.shape[-1] batch.append( np.pad(example, [(0, pad_len)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype) @@ -113,7 +118,11 @@ class SpecBatcher(object): self.time_major = time_major def __call__(self, minibatch): - out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype) + out = batch_spec( + minibatch, + pad_value=self.pad_value, + time_major=self.time_major, + dtype=self.dtype) return out @@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): """ # assume (F, T) or (T, F) peek_example = minibatch[0] - assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram" + assert len( + peek_example.shape) == 2, "we only handles mono channel spectrogram" # assume (F, n_frame) or (n_frame, F) time_idx = 0 if time_major else -1 @@ -143,11 +153,11 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): if time_major: batch.append( np.pad(example, [(0, pad_len), (0, 0)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) else: batch.append( np.pad(example, [(0, 0), (0, pad_len)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype) diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py index de9b40c..a188767 100644 --- a/parakeet/data/dataset.py +++ b/parakeet/data/dataset.py @@ -17,17 +17,25 @@ import paddle from paddle.io import Dataset __all__ = [ - "split", "TransformDataset", "CacheDataset", "TupleDataset", - "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset", + "split", + "TransformDataset", + "CacheDataset", + "TupleDataset", + "DictDataset", + "SliceDataset", + "SubsetDataset", + "FilterDataset", "ChainDataset", ] + def split(dataset, first_size): """A utility function to split a dataset into two datasets.""" first = SliceDataset(dataset, 0, first_size) second = SliceDataset(dataset, first_size, len(dataset)) return first, second + class TransformDataset(Dataset): def __init__(self, dataset, transform): """Dataset which is transformed from another with a transform. @@ -141,7 +149,7 @@ class DictDataset(Dataset): for i in six.moves.range(length)] else: return batches - + def __len__(self): return self._length diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index de7be70..e75da0b 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.datasets.common import * from parakeet.datasets.ljspeech import * \ No newline at end of file diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py index e0d91a3..a1d16d6 100644 --- a/parakeet/datasets/common.py +++ b/parakeet/datasets/common.py @@ -1,9 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from paddle.io import Dataset import os import librosa __all__ = ["AudioFolderDataset"] + class AudioFolderDataset(Dataset): def __init__(self, path, sample_rate, extension="wav"): self.root = os.path.expanduser(path) @@ -19,5 +34,5 @@ class AudioFolderDataset(Dataset): def __getitem__(self, i): file_name = self.file_names[i] - y, _ = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable + y, _ = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable return y diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index 9c2e0c3..a37863f 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -1,8 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from paddle.io import Dataset from pathlib import Path __all__ = ["LJSpeechMetaData"] + class LJSpeechMetaData(Dataset): def __init__(self, root): self.root = Path(root).expanduser() @@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset): def __len__(self): return len(self.records) - diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index cee73c1..2d06dda 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.frontend.vocab import * from parakeet.frontend.phonectic import * from parakeet.frontend.punctuation import * diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py index f098650..37fd580 100644 --- a/parakeet/frontend/normalizer/__init__.py +++ b/parakeet/frontend/normalizer/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.frontend.normalizer.normalizer import * from parakeet.frontend.normalizer.numbers import * diff --git a/parakeet/frontend/normalizer/abbrrviation.py b/parakeet/frontend/normalizer/abbrrviation.py index e69de29..9118340 100644 --- a/parakeet/frontend/normalizer/abbrrviation.py +++ b/parakeet/frontend/normalizer/abbrrviation.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/parakeet/frontend/normalizer/acronyms.py b/parakeet/frontend/normalizer/acronyms.py index e69de29..9118340 100644 --- a/parakeet/frontend/normalizer/acronyms.py +++ b/parakeet/frontend/normalizer/acronyms.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/parakeet/frontend/normalizer/width.py b/parakeet/frontend/normalizer/width.py index 440557f..b1598af 100644 --- a/parakeet/frontend/normalizer/width.py +++ b/parakeet/frontend/normalizer/width.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + def full2half_width(ustr): half = [] for u in ustr: num = ord(u) - if num == 0x3000: # 全角空格变半角 + if num == 0x3000: # 全角空格变半角 num = 32 elif 0xFF01 <= num <= 0xFF5E: num -= 0xfee0 @@ -10,15 +24,16 @@ def full2half_width(ustr): half.append(u) return ''.join(half) + def half2full_width(ustr): full = [] for u in ustr: num = ord(u) - if num == 32: # 半角空格变全角 + if num == 32: # 半角空格变全角 num = 0x3000 elif 0x21 <= num <= 0x7E: num += 0xfee0 - u = chr(num) # to unicode + u = chr(num) # to unicode full.append(u) - - return ''.join(full) \ No newline at end of file + + return ''.join(full) diff --git a/parakeet/frontend/punctuation.py b/parakeet/frontend/punctuation.py index 9984970..099e759 100644 --- a/parakeet/frontend/punctuation.py +++ b/parakeet/frontend/punctuation.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import abc import string @@ -13,15 +27,8 @@ EN_PUNCT = [ "!", ] -CN_PUNCT = [ - "、", - ",", - ";", - ":", - "。", - "?", - "!" -] +CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] + def get_punctuations(lang): if lang == "en": @@ -30,4 +37,3 @@ def get_punctuations(lang): return CN_PUNCT else: raise ValueError(f"language {lang} Not supported") - diff --git a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts.py index f84a9f8..c7f0ccd 100644 --- a/parakeet/models/transformer_tts.py +++ b/parakeet/models/transformer_tts.py @@ -559,7 +559,7 @@ class TransformerTTS(nn.Layer): @classmethod def from_pretrained(cls, frontend, config, checkpoint_path): model = TransformerTTS( - frontend, + frontend, d_encoder=config.model.d_encoder, d_decoder=config.model.d_decoder, d_mel=config.data.d_mel, @@ -575,11 +575,12 @@ class TransformerTTS(nn.Layer): decoder_prenet_dropout=config.model.decoder_prenet_dropout, dropout=config.model.dropout) - iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) + iteration = checkpoint.load_parameters( + model, checkpoint_path=checkpoint_path) drop_n_heads = scheduler.StepWise(config.training.drop_n_heads) reduction_factor = scheduler.StepWise(config.training.reduction_factor) model.set_constants( - reduction_factor=reduction_factor(iteration), + reduction_factor=reduction_factor(iteration), drop_n_heads=drop_n_heads(iteration)) return model diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py index d58127b..625e61f 100644 --- a/parakeet/models/waveflow.py +++ b/parakeet/models/waveflow.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import numpy as np from typing import List, Union, Tuple @@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] + def fold(x, n_group): r"""Fold audio or spectrogram's temporal dimension in to groups. @@ -31,6 +46,7 @@ def fold(x, n_group): new_shape = spatial_shape + [time_steps // n_group, n_group] return paddle.reshape(x, new_shape) + class UpsampleNet(nn.LayerList): """Layer to upsample mel spectrogram to the same temporal resolution with the corresponding waveform. @@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList): --------- ``librosa.core.stft`` """ + def __init__(self, upsample_factors): super(UpsampleNet, self).__init__() for factor in upsample_factors: @@ -67,16 +84,18 @@ class UpsampleNet(nn.LayerList): init = I.Uniform(-std, std) self.append( nn.utils.weight_norm( - nn.Conv2DTranspose(1, 1, (3, 2 * factor), + nn.Conv2DTranspose( + 1, + 1, (3, 2 * factor), padding=(1, factor // 2), stride=(1, factor), weight_attr=init, bias_attr=init))) - + # upsample factors self.upsample_factor = np.prod(upsample_factors) self.upsample_factors = upsample_factors - + def forward(self, x, trim_conv_artifact=False): r"""Forward pass of the ``UpsampleNet``. @@ -131,38 +150,47 @@ class ResidualBlock(nn.Layer): dilations : int Dilations of the Convolution2d applied to the input. """ + def __init__(self, channels, cond_channels, kernel_size, dilations): super(ResidualBlock, self).__init__() # input conv std = math.sqrt(1 / channels * np.prod(kernel_size)) init = I.Uniform(-std, std) - receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)] + receptive_field = [ + 1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations) + ] rh, rw = receptive_field - paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same - conv = nn.Conv2D(channels, 2 * channels, kernel_size, - padding=paddings, - dilation=dilations, - weight_attr=init, - bias_attr=init) + paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same + conv = nn.Conv2D( + channels, + 2 * channels, + kernel_size, + padding=paddings, + dilation=dilations, + weight_attr=init, + bias_attr=init) self.conv = nn.utils.weight_norm(conv) self.rh = rh self.rw = rw self.dilations = dilations - + # condition projection std = math.sqrt(1 / cond_channels) init = I.Uniform(-std, std) - condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1), - weight_attr=init, bias_attr=init) + condition_proj = nn.Conv2D( + cond_channels, + 2 * channels, (1, 1), + weight_attr=init, + bias_attr=init) self.condition_proj = nn.utils.weight_norm(condition_proj) - + # parametric residual & skip connection std = math.sqrt(1 / channels) init = I.Uniform(-std, std) - out_proj = nn.Conv2D(channels, 2 * channels, (1, 1), - weight_attr=init, bias_attr=init) + out_proj = nn.Conv2D( + channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init) self.out_proj = nn.utils.weight_norm(out_proj) - + def forward(self, x, condition): """Compute output for a whole folded sequence. @@ -185,10 +213,10 @@ class ResidualBlock(nn.Layer): x_in = x x = self.conv(x) x += self.condition_proj(condition) - + content, gate = paddle.chunk(x, 2, axis=1) x = paddle.tanh(content) * F.sigmoid(gate) - + x = self.out_proj(x) res, skip = paddle.chunk(x, 2, axis=1) res = x_in + res @@ -249,7 +277,7 @@ class ResidualBlock(nn.Layer): content, gate = paddle.chunk(x_row, 2, axis=1) x_row = paddle.tanh(content) * F.sigmoid(gate) - + x_row = self.out_proj(x_row) res, skip = paddle.chunk(x_row, 2, axis=1) res = x_row_in + res @@ -290,20 +318,23 @@ class ResidualNet(nn.LayerList): ValueError If the length of dilations_h does not equals n_layers. """ - def __init__(self, - n_layer: int, - residual_channels: int, - condition_channels: int, - kernel_size: Tuple[int], + + def __init__(self, + n_layer: int, + residual_channels: int, + condition_channels: int, + kernel_size: Tuple[int], dilations_h: List[int]): if len(dilations_h) != n_layer: - raise ValueError("number of dilations_h should equals num of layers") + raise ValueError( + "number of dilations_h should equals num of layers") super(ResidualNet, self).__init__() for i in range(n_layer): - dilation = (dilations_h[i], 2 ** i) - layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation) + dilation = (dilations_h[i], 2**i) + layer = ResidualBlock(residual_channels, condition_channels, + kernel_size, dilation) self.append(layer) - + def forward(self, x, condition): """Comput the output of given the input and the condition. @@ -332,7 +363,7 @@ class ResidualNet(nn.LayerList): """ for layer in self: layer.start_sequence() - + def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. @@ -386,33 +417,37 @@ class Flow(nn.Layer): Number of timesteps to the folded into a group. """ dilations_dict = { - 8: [1, 1, 1, 1, 1, 1, 1, 1], - 16: [1, 1, 1, 1, 1, 1, 1, 1], - 32: [1, 2, 4, 1, 2, 4, 1, 2], - 64: [1, 2, 4, 8, 16, 1, 2, 4], - 128: [1, 2, 4, 8, 16, 32, 64, 1] + 8: [1, 1, 1, 1, 1, 1, 1, 1], + 16: [1, 1, 1, 1, 1, 1, 1, 1], + 32: [1, 2, 4, 1, 2, 4, 1, 2], + 64: [1, 2, 4, 8, 16, 1, 2, 4], + 128: [1, 2, 4, 8, 16, 32, 64, 1] } - + def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group): super(Flow, self).__init__() # input projection self.input_proj = nn.utils.weight_norm( - nn.Conv2D(1, channels, (1, 1), - weight_attr=I.Uniform(-1., 1.), - bias_attr=I.Uniform(-1., 1.))) - + nn.Conv2D( + 1, + channels, (1, 1), + weight_attr=I.Uniform(-1., 1.), + bias_attr=I.Uniform(-1., 1.))) + # residual net - self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, + self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, self.dilations_dict[n_group]) - + # output projection - self.output_proj = nn.Conv2D(channels, 2, (1, 1), - weight_attr=I.Constant(0.), - bias_attr=I.Constant(0.)) - + self.output_proj = nn.Conv2D( + channels, + 2, (1, 1), + weight_attr=I.Constant(0.), + bias_attr=I.Constant(0.)) + # specs self.n_group = n_group - + def _predict_parameters(self, x, condition): x = self.input_proj(x) x = self.resnet(x, condition) @@ -421,11 +456,11 @@ class Flow(nn.Layer): return logs, b def _transform(self, x, logs, b): - z_0 = x[:, :, :1, :] # the first row, just copy it - z_out = x[:, :, 1:, :] * paddle.exp(logs) + b + z_0 = x[:, :, :1, :] # the first row, just copy it + z_out = x[:, :, 1:, :] * paddle.exp(logs) + b z_out = paddle.concat([z_0, z_out], axis=2) return z_out - + def forward(self, x, condition): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). @@ -452,8 +487,8 @@ class Flow(nn.Layer): transformation from x to z. """ # (B, C, H-1, W) - logs, b = self._predict_parameters( - x[:, :, :-1, :], condition[:, :, 1:, :]) + logs, b = self._predict_parameters(x[:, :, :-1, :], + condition[:, :, 1:, :]) z = self._transform(x, logs, b) return z, (logs, b) @@ -467,7 +502,7 @@ class Flow(nn.Layer): def _inverse_transform_row(self, z_row, logs, b): x_row = (z_row - b) * paddle.exp(-logs) return x_row - + def _inverse_row(self, z_row, x_row, condition_row): logs, b = self._predict_row_parameters(x_row, condition_row) x_next_row = self._inverse_transform_row(z_row, logs, b) @@ -475,7 +510,7 @@ class Flow(nn.Layer): def _start_sequence(self): self.resnet.start_sequence() - + def inverse(self, z, condition): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. @@ -510,15 +545,16 @@ class Flow(nn.Layer): self._start_sequence() for i in range(1, self.n_group): - x_row = x[-1] # actuallt i-1:i - z_row = z[:, :, i:i+1, :] - condition_row = condition[:, :, i:i+1, :] + x_row = x[-1] # actuallt i-1:i + z_row = z[:, :, i:i + 1, :] + condition_row = condition[:, :, i:i + 1, :] - x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row) + x_next_row, (logs, b) = self._inverse_row(z_row, x_row, + condition_row) x.append(x_next_row) logs_list.append(logs) b_list.append(b) - + x = paddle.concat(x, 2) logs = paddle.concat(logs_list, 2) b = paddle.concat(b_list, 2) @@ -549,21 +585,25 @@ class WaveFlow(nn.LayerList): kernel_size : Union[int, List[int]] Kernel size of the convolution layer in each ResidualBlock. """ - def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size): + + def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, + kernel_size): if n_group % 2 or n_flows % 2: - raise ValueError("number of flows and number of group must be even " - "since a permutation along group among flows is used.") + raise ValueError( + "number of flows and number of group must be even " + "since a permutation along group among flows is used.") super(WaveFlow, self).__init__() for _ in range(n_flows): - self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group)) - + self.append( + Flow(n_layers, channels, mel_bands, kernel_size, n_group)) + # permutations in h self.perms = self._create_perm(n_group, n_flows) # specs self.n_group = n_group self.n_flows = n_flows - + def _create_perm(self, n_group, n_flows): indices = list(range(n_group)) half = n_group // 2 @@ -572,20 +612,21 @@ class WaveFlow(nn.LayerList): if i < n_flows // 2: perms.append(indices[::-1]) else: - perm = list(reversed(indices[:half])) + list(reversed(indices[half:])) + perm = list(reversed(indices[:half])) + list( + reversed(indices[half:])) perms.append(perm) return perms - + def _trim(self, x, condition): assert condition.shape[-1] >= x.shape[-1] pruned_len = int(x.shape[-1] // self.n_group * self.n_group) - + if x.shape[-1] > pruned_len: x = x[:, :pruned_len] if condition.shape[-1] > pruned_len: condition = condition[:, :, :pruned_len] return x, condition - + def forward(self, x, condition): """Probability density estimation of random variable x given the condition. @@ -610,21 +651,23 @@ class WaveFlow(nn.LayerList): # x: (B, T) # condition: (B, C, T) upsampled condition x, condition = self._trim(x, condition) - + # to (B, C, h, T//h) layout - x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1) - condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2]) - + x = paddle.unsqueeze( + paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) + # flows logs_list = [] for i, layer in enumerate(self): - x, (logs, b) = layer(x, condition) + x, (logs, b) = layer(x, condition) logs_list.append(logs) # permute paddle has no shuffle dim x = geo.shuffle_dim(x, 2, perm=self.perms[i]) condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) - z = paddle.squeeze(x, 1) # (B, H, W) + z = paddle.squeeze(x, 1) # (B, H, W) batch_size = z.shape[0] z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1]) @@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList): z, condition = self._trim(z, condition) # to (B, C, h, T//h) layout - z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1) - condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2]) + z = paddle.unsqueeze( + paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) # reverse it flow by flow for i in reversed(range(self.n_flows)): @@ -663,7 +708,7 @@ class WaveFlow(nn.LayerList): condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) z, (logs, b) = self[i].inverse(z, condition) - x = paddle.squeeze(z, 1) # (B, H, W) + x = paddle.squeeze(z, 1) # (B, H, W) batch_size = x.shape[0] x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1]) return x @@ -695,23 +740,24 @@ class ConditionalWaveFlow(nn.LayerList): kernel_size : Union[int, List[int]] Kernel size of the convolution layer in each ResidualBlock. """ - def __init__(self, - upsample_factors: List[int], - n_flows: int, - n_layers: int, - n_group: int, - channels: int, - n_mels: int, - kernel_size: Union[int, List[int]]): + + def __init__(self, + upsample_factors: List[int], + n_flows: int, + n_layers: int, + n_group: int, + channels: int, + n_mels: int, + kernel_size: Union[int, List[int]]): super(ConditionalWaveFlow, self).__init__() self.encoder = UpsampleNet(upsample_factors) self.decoder = WaveFlow( - n_flows=n_flows, - n_layers=n_layers, - n_group=n_group, - channels=channels, - mel_bands=n_mels, - kernel_size=kernel_size) + n_flows=n_flows, + n_layers=n_layers, + n_group=n_group, + channels=channels, + mel_bands=n_mels, + kernel_size=kernel_size) def forward(self, audio, mel): """Compute the transformed random variable z (x to z) and the log of @@ -737,7 +783,7 @@ class ConditionalWaveFlow(nn.LayerList): condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) return z, log_det_jacobian - + @paddle.no_grad() def infer(self, mel): r"""Generate raw audio given mel spectrogram. @@ -752,12 +798,12 @@ class ConditionalWaveFlow(nn.LayerList): Tensor : [shape=(B, T)] The synthesized audio, where``T <= T_mel \* upsample_factors``. """ - condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) + condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) batch_size, _, time_steps = condition.shape z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) x = self.decoder.inverse(z, condition) return x - + @paddle.no_grad() def predict(self, mel): """Generate raw audio given mel spectrogram. @@ -777,7 +823,7 @@ class ConditionalWaveFlow(nn.LayerList): audio = self.infer(mel) audio = audio[0].numpy() return audio - + @classmethod def from_pretrained(cls, config, checkpoint_path): """Build a ConditionalWaveFlow model from a pretrained model. @@ -795,14 +841,13 @@ class ConditionalWaveFlow(nn.LayerList): ConditionalWaveFlow The model built from pretrained result. """ - model = cls( - upsample_factors=config.model.upsample_factors, - n_flows=config.model.n_flows, - n_layers=config.model.n_layers, - n_group=config.model.n_group, - channels=config.model.channels, - n_mels=config.data.n_mels, - kernel_size=config.model.kernel_size) + model = cls(upsample_factors=config.model.upsample_factors, + n_flows=config.model.n_flows, + n_layers=config.model.n_layers, + n_group=config.model.n_group, + channels=config.model.channels, + n_mels=config.data.n_mels, + kernel_size=config.model.kernel_size) checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) return model @@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer): The standard deviation of the gaussian noise used in WaveFlow, by default 1.0. """ + def __init__(self, sigma=1.0): super(WaveFlowLoss, self).__init__() self.sigma = sigma @@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer): Tensor [shape=(1,)] The loss. """ - loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian + loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma + ) - log_det_jacobian loss = loss / np.prod(z.shape) return loss + self.const diff --git a/parakeet/models/wavenet.py b/parakeet/models/wavenet.py index 8e6f272..5ff3435 100644 --- a/parakeet/models/wavenet.py +++ b/parakeet/models/wavenet.py @@ -18,7 +18,7 @@ from typing import Union, Sequence, List from tqdm import trange import numpy as np -import paddle +import paddle from paddle import nn from paddle.nn import functional as F import paddle.fluid.initializer as I @@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools __all__ = ["WaveNet", "ConditionalWaveNet"] + def crop(x, audio_start, audio_length): """Crop the upsampled condition to match audio_length. @@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList): --------- ``librosa.core.stft`` """ + def __init__(self, upscale_factors=[16, 16]): super(UpsampleNet, self).__init__() self.upscale_factors = list(upscale_factors) @@ -106,9 +108,11 @@ class UpsampleNet(nn.LayerList): for factor in self.upscale_factors: self.append( nn.utils.weight_norm( - nn.Conv2DTranspose(1, 1, - kernel_size=(3, 2 * factor), - stride=(1, factor), + nn.Conv2DTranspose( + 1, + 1, + kernel_size=(3, 2 * factor), + stride=(1, factor), padding=(1, factor // 2)))) def forward(self, x): @@ -159,29 +163,34 @@ class ResidualBlock(nn.Layer): dilation :int Dilation of the internal convolution cells. """ - def __init__(self, - residual_channels: int, - condition_dim: int, + + def __init__(self, + residual_channels: int, + condition_dim: int, filter_size: Union[int, Sequence[int]], dilation: int): - + super(ResidualBlock, self).__init__() dilated_channels = 2 * residual_channels # following clarinet's implementation, we do not have parametric residual # & skip connection. - _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size + _filter_size = filter_size[0] if isinstance(filter_size, ( + list, tuple)) else filter_size std = math.sqrt(1 / (_filter_size * residual_channels)) - conv = Conv1dCell(residual_channels, - dilated_channels, - filter_size, - dilation=dilation, - weight_attr=I.Normal(scale=std)) + conv = Conv1dCell( + residual_channels, + dilated_channels, + filter_size, + dilation=dilation, + weight_attr=I.Normal(scale=std)) self.conv = nn.utils.weight_norm(conv) std = math.sqrt(1 / condition_dim) - condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,), - weight_attr=I.Normal(scale=std)) + condition_proj = Conv1dCell( + condition_dim, + dilated_channels, (1, ), + weight_attr=I.Normal(scale=std)) self.condition_proj = nn.utils.weight_norm(condition_proj) self.filter_size = filter_size @@ -309,10 +318,11 @@ class ResidualNet(nn.LayerList): Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``. """ - def __init__(self, - n_stack: int, - n_loop: int, - residual_channels: int, + + def __init__(self, + n_stack: int, + n_loop: int, + residual_channels: int, condition_dim: int, filter_size: int): super(ResidualNet, self).__init__() @@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList): dilations = [2**i for i in range(n_loop)] * n_stack self.context_size = 1 + sum(dilations) for dilation in dilations: - self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation)) + self.append( + ResidualBlock(residual_channels, condition_dim, filter_size, + dilation)) def forward(self, x, condition=None): """Forward pass of ``ResidualNet``. @@ -345,7 +357,7 @@ class ResidualNet(nn.LayerList): skip_connections = skip else: skip_connections = paddle.scale(skip_connections + skip, - math.sqrt(0.5)) + math.sqrt(0.5)) return skip_connections def start_sequence(self): @@ -381,7 +393,7 @@ class ResidualNet(nn.LayerList): skip_connections = skip else: skip_connections = paddle.scale(skip_connections + skip, - math.sqrt(0.5)) + math.sqrt(0.5)) return skip_connections @@ -426,6 +438,7 @@ class WaveNet(nn.Layer): This is only used for computing loss when ``loss_type`` is "mog", If the predicted log scale is less than -9.0, it is clipped at -9.0. """ + def __init__(self, n_stack, n_loop, residual_channels, output_dim, condition_dim, filter_size, loss_type, log_scale_min): @@ -437,19 +450,24 @@ class WaveNet(nn.Layer): else: if (output_dim % 3 != 0): raise ValueError( - "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim)) - self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1) + "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}". + format(output_dim)) + self.embed = nn.utils.weight_norm( + nn.Linear(1, residual_channels), dim=1) self.resnet = ResidualNet(n_stack, n_loop, residual_channels, condition_dim, filter_size) self.context_size = self.resnet.context_size skip_channels = residual_channels # assume the same channel - self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1) - self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1) + self.proj1 = nn.utils.weight_norm( + nn.Linear(skip_channels, skip_channels), dim=1) + self.proj2 = nn.utils.weight_norm( + nn.Linear(skip_channels, skip_channels), dim=1) # if loss_type is softmax, output_dim is n_vocab of waveform magnitude. # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev) - self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1) + self.proj3 = nn.utils.weight_norm( + nn.Linear(skip_channels, output_dim), dim=1) self.loss_type = loss_type self.output_dim = output_dim @@ -781,26 +799,28 @@ class ConditionalWaveNet(nn.Layer): This is only used for computing loss when ``loss_type`` is "mog", If the predicted log scale is less than -9.0, it is clipped at -9.0. """ - def __init__(self, - upsample_factors: List[int], - n_stack: int, - n_loop: int, - residual_channels: int, + + def __init__(self, + upsample_factors: List[int], + n_stack: int, + n_loop: int, + residual_channels: int, output_dim: int, - n_mels: int, - filter_size: int=2, - loss_type: str="mog", + n_mels: int, + filter_size: int=2, + loss_type: str="mog", log_scale_min: float=-9.0): super(ConditionalWaveNet, self).__init__() self.encoder = UpsampleNet(upsample_factors) - self.decoder = WaveNet(n_stack=n_stack, - n_loop=n_loop, - residual_channels=residual_channels, - output_dim=output_dim, - condition_dim=n_mels, - filter_size=filter_size, - loss_type=loss_type, - log_scale_min=log_scale_min) + self.decoder = WaveNet( + n_stack=n_stack, + n_loop=n_loop, + residual_channels=residual_channels, + output_dim=output_dim, + condition_dim=n_mels, + filter_size=filter_size, + loss_type=loss_type, + log_scale_min=log_scale_min) def forward(self, audio, mel, audio_start): """Compute the output distribution given the mel spectrogram and the input(for teacher force training). @@ -895,11 +915,11 @@ class ConditionalWaveNet(nn.Layer): self.decoder.start_sequence() x_t = paddle.zeros((batch_size, ), dtype=mel.dtype) for i in trange(time_steps): - c_t = condition[:, :, i] # (B, C) - y_t = self.decoder.add_input(x_t, c_t) #(B, C) + c_t = condition[:, :, i] # (B, C) + y_t = self.decoder.add_input(x_t, c_t) #(B, C) y_t = paddle.unsqueeze(y_t, 1) - x_t = self.sample(y_t) # (B, 1) - x_t = paddle.squeeze(x_t, 1) #(B,) + x_t = self.sample(y_t) # (B, 1) + x_t = paddle.squeeze(x_t, 1) #(B,) samples.append(x_t) samples = paddle.stack(samples, -1) return samples @@ -943,16 +963,15 @@ class ConditionalWaveNet(nn.Layer): ConditionalWaveNet The model built from pretrained result. """ - model = cls( - upsample_factors=config.model.upsample_factors, - n_stack=config.model.n_stack, - n_loop=config.model.n_loop, - residual_channels=config.model.residual_channels, - output_dim=config.model.output_dim, - n_mels=config.data.n_mels, - filter_size=config.model.filter_size, - loss_type=config.model.loss_type, - log_scale_min=config.model.log_scale_min) + model = cls(upsample_factors=config.model.upsample_factors, + n_stack=config.model.n_stack, + n_loop=config.model.n_loop, + residual_channels=config.model.residual_channels, + output_dim=config.model.output_dim, + n_mels=config.data.n_mels, + filter_size=config.model.filter_size, + loss_type=config.model.loss_type, + log_scale_min=config.model.log_scale_min) layer_tools.summary(model) checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) return model diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py index ebcc6c6..03e42b0 100644 --- a/parakeet/modules/audio.py +++ b/parakeet/modules/audio.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle import nn from paddle.nn import functional as F from scipy import signal -import numpy as np +import numpy as np __all__ = ["quantize", "dequantize", "STFT"] @@ -86,6 +100,7 @@ class STFT(nn.Layer): Ony ``center`` and ``reflect`` padding is supported now. """ + def __init__(self, n_fft, hop_length, win_length, window="hanning"): super(STFT, self).__init__() self.hop_length = hop_length @@ -109,7 +124,8 @@ class STFT(nn.Layer): (self.n_bin, 1, 1, self.n_fft)) w = np.concatenate([w_real, w_imag], axis=0) - self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) + self.weight = paddle.cast( + paddle.to_tensor(w), paddle.get_default_dtype()) def forward(self, x): """Compute the stft transform. diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py index b57abf2..d984605 100644 --- a/parakeet/modules/conv.py +++ b/parakeet/modules/conv.py @@ -20,6 +20,7 @@ __all__ = [ "Conv1dBatchNorm", ] + class Conv1dCell(nn.Conv1D): """A subclass of Conv1D layer, which can be used in an autoregressive decoder like an RNN cell. @@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer): epsilon : [type], optional The epsilon of the BatchNorm1D layer, by default 1e-05 """ + def __init__(self, in_channels, out_channels, diff --git a/parakeet/modules/geometry.py b/parakeet/modules/geometry.py index ec96daf..05a5931 100644 --- a/parakeet/modules/geometry.py +++ b/parakeet/modules/geometry.py @@ -1,6 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import paddle + def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. @@ -32,7 +47,7 @@ def shuffle_dim(x, axis, perm=None): perm = np.array(perm) else: perm = np.random.permutation(size) - + perm = paddle.to_tensor(perm) out = paddle.gather(x, perm, axis) return out diff --git a/parakeet/modules/losses.py b/parakeet/modules/losses.py index 3e22480..ab188fd 100644 --- a/parakeet/modules/losses.py +++ b/parakeet/modules/losses.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numba import numpy as np import paddle @@ -5,12 +19,13 @@ from paddle import nn from paddle.nn import functional as F __all__ = [ - "weighted_mean", - "masked_l1_loss", - "masked_softmax_with_cross_entropy", + "weighted_mean", + "masked_l1_loss", + "masked_softmax_with_cross_entropy", "diagonal_loss", ] + def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. @@ -88,12 +103,11 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): return loss -def diagonal_loss( - attentions, - input_lengths, - target_lengths, - g=0.2, - multihead=False): +def diagonal_loss(attentions, + input_lengths, + target_lengths, + g=0.2, + multihead=False): """A metric to evaluate how diagonal a attention distribution is. It is computed for batch attention distributions. For each attention @@ -133,6 +147,7 @@ def diagonal_loss( else: return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1)) + @numba.jit(nopython=True) def guided_attention(N, max_N, T, max_T, g): W = np.zeros((max_T, max_N), dtype=np.float32) @@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g): # (T_dec, T_enc) return W + def guided_attentions(input_lengths, target_lengths, g=0.2): B = len(input_lengths) max_input_len = input_lengths.max() @@ -151,4 +167,4 @@ def guided_attentions(input_lengths, target_lengths, g=0.2): W[b] = guided_attention(input_lengths[b], max_input_len, target_lengths[b], max_target_len, g) # (B, T_dec, T_enc) - return W \ No newline at end of file + return W diff --git a/parakeet/modules/masking.py b/parakeet/modules/masking.py index c54a5b1..96871a9 100644 --- a/parakeet/modules/masking.py +++ b/parakeet/modules/masking.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle.fluid.layers import sequence_mask @@ -8,6 +22,7 @@ __all__ = [ "future_mask", ] + def id_mask(input, padding_index=0, dtype="bool"): """Generate mask with input ids. diff --git a/parakeet/modules/positional_encoding.py b/parakeet/modules/positional_encoding.py index 084ccf3..07a86c9 100644 --- a/parakeet/modules/positional_encoding.py +++ b/parakeet/modules/positional_encoding.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import numpy as np import paddle @@ -5,6 +19,7 @@ from paddle.nn import functional as F __all__ = ["positional_encoding"] + def positional_encoding(start_index, length, size, dtype=None): r"""Generate standard positional encoding matrix. @@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None): dtype = dtype or paddle.get_default_dtype() channel = np.arange(0, size, 2) index = np.arange(start_index, start_index + length, 1) - p = np.expand_dims(index, -1) / (10000 ** (channel / float(size))) + p = np.expand_dims(index, -1) / (10000**(channel / float(size))) encodings = np.zeros([length, size]) encodings[:, 0::2] = np.sin(p) encodings[:, 1::2] = np.cos(p) diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py index 18a7523..e857990 100644 --- a/parakeet/modules/transformer.py +++ b/parakeet/modules/transformer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import paddle from paddle import nn @@ -12,6 +26,7 @@ __all__ = [ "TransformerDecoderLayer", ] + class PositionwiseFFN(nn.Layer): """A faithful implementation of Position-wise Feed-Forward Network in `Attention is All You Need `_. @@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer): The probability of the Dropout applied to the output of the first layer, by default 0. """ - def __init__(self, - input_size: int, - hidden_size: int, - dropout=0.0): + + def __init__(self, input_size: int, hidden_size: int, dropout=0.0): super(PositionwiseFFN, self).__init__() self.linear1 = nn.Linear(input_size, hidden_size) self.linear2 = nn.Linear(hidden_size, input_size) @@ -86,16 +99,17 @@ class TransformerEncoderLayer(nn.Layer): ------ It uses the PostLN (post layer norm) scheme. """ + def __init__(self, d_model, n_heads, d_ffn, dropout=0.): super(TransformerEncoderLayer, self).__init__() self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.dropout = dropout - + def forward(self, x, mask): """Forward pass of TransformerEncoderLayer. @@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer): """ context_vector, attn_weights = self.self_mha(x, x, x, mask) x = self.layer_norm1( - F.dropout(x + context_vector, - self.dropout, - training=self.training)) - + F.dropout( + x + context_vector, self.dropout, training=self.training)) + x = self.layer_norm2( - F.dropout(x + self.ffn(x), - self.dropout, - training=self.training)) + F.dropout( + x + self.ffn(x), self.dropout, training=self.training)) return x, attn_weights @@ -155,19 +167,20 @@ class TransformerDecoderLayer(nn.Layer): ------ It uses the PostLN (post layer norm) scheme. """ + def __init__(self, d_model, n_heads, d_ffn, dropout=0.): super(TransformerDecoderLayer, self).__init__() self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.dropout = dropout - + def forward(self, q, k, v, encoder_mask, decoder_mask): """Forward pass of TransformerEncoderLayer. @@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer): cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] Decoder-encoder cross attention. """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) + context_vector, self_attn_weights = self.self_mha(q, q, q, + decoder_mask) q = self.layer_norm1( - F.dropout(q + context_vector, - self.dropout, - training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask) + F.dropout( + q + context_vector, self.dropout, training=self.training)) + + context_vector, cross_attn_weights = self.cross_mha(q, k, v, + encoder_mask) q = self.layer_norm2( - F.dropout(q + context_vector, - self.dropout, - training=self.training)) - + F.dropout( + q + context_vector, self.dropout, training=self.training)) + q = self.layer_norm3( - F.dropout(q + self.ffn(q), - self.dropout, - training=self.training)) + F.dropout( + q + self.ffn(q), self.dropout, training=self.training)) return q, self_attn_weights, cross_attn_weights diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py index cb1c59b..aec401c 100644 --- a/parakeet/training/__init__.py +++ b/parakeet/training/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.training.cli import * from parakeet.training.experiment import * diff --git a/parakeet/training/cli.py b/parakeet/training/cli.py index e6b6fe5..a3cfbda 100644 --- a/parakeet/training/cli.py +++ b/parakeet/training/cli.py @@ -1,5 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse + def default_argument_parser(): r"""A simple yet genral argument parser for experiments with parakeet. @@ -46,5 +61,5 @@ def default_argument_parser(): # overwrite extra config and default config parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") # yapd: enable - + return parser diff --git a/parakeet/training/default_config.py b/parakeet/training/default_config.py index f4b9c29..583f6e6 100644 --- a/parakeet/training/default_config.py +++ b/parakeet/training/default_config.py @@ -1,12 +1,26 @@ -from yacs.config import CfgNode +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from yacs.config import CfgNode _C = CfgNode( dict( - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=900000, # max iteration to train - ) -) + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=900000, # max iteration to train + )) + def get_default_training_config(): return _C.clone() diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py index 1bf0af6..16da93d 100644 --- a/parakeet/training/experiment.py +++ b/parakeet/training/experiment.py @@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools __all__ = ["ExperimentBase"] + class ExperimentBase(object): """ An experiment template in order to structure the training code and take diff --git a/parakeet/utils/checkpoint.py b/parakeet/utils/checkpoint.py index ec6f282..0d2a2e2 100644 --- a/parakeet/utils/checkpoint.py +++ b/parakeet/utils/checkpoint.py @@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int: return iteration + def _save_checkpoint(checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpointed. @@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int): with open(checkpoint_record, "wt") as handle: handle.write("model_checkpoint_path: step-{}".format(iteration)) + def load_parameters(model, optimizer=None, checkpoint_dir=None, @@ -97,18 +99,19 @@ def load_parameters(model, params_path = checkpoint_path + ".pdparams" model_dict = paddle.load(params_path) model.set_state_dict(model_dict) - print("[checkpoint] Rank {}: loaded model from {}".format( - local_rank, params_path)) - + print("[checkpoint] Rank {}: loaded model from {}".format(local_rank, + params_path)) + optimizer_path = checkpoint_path + ".pdopt" if optimizer and os.path.isfile(optimizer_path): optimizer_dict = paddle.load(optimizer_path) optimizer.set_state_dict(optimizer_dict) - print("[checkpoint] Rank {}: loaded optimizer state from {}". - format(local_rank, optimizer_path)) + print("[checkpoint] Rank {}: loaded optimizer state from {}".format( + local_rank, optimizer_path)) return iteration + @mp_tools.rank_zero_only def save_parameters(checkpoint_dir, iteration, model, optimizer=None): """Checkpoint the latest trained model parameters. @@ -124,7 +127,7 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None): None """ checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration)) - + model_dict = model.state_dict() params_path = checkpoint_path + ".pdparams" paddle.save(model_dict, params_path) diff --git a/parakeet/utils/internals.py b/parakeet/utils/internals.py index c72a9b0..968a604 100644 --- a/parakeet/utils/internals.py +++ b/parakeet/utils/internals.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle.framework import core diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py index 2268377..fcda44f 100644 --- a/parakeet/utils/layer_tools.py +++ b/parakeet/utils/layer_tools.py @@ -28,6 +28,7 @@ def summary(layer: nn.Layer): print("layer has {} parameters, {} elements.".format(num_params, num_elements)) + def gradient_norm(layer: nn.Layer): grad_norm_dict = {} for name, param in layer.state_dict().items(): @@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer): grad_norm_dict[name] = np.linalg.norm(grad) / grad.size return grad_norm_dict + def recursively_remove_weight_norm(layer: nn.Layer): for layer in layer.sublayers(): try: @@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer): # ther is not weight norm hoom in this layer pass + def freeze(layer: nn.Layer): for param in layer.parameters(): param.trainable = False + def unfreeze(layer: nn.Layer): for param in layer.parameters(): param.trainable = True diff --git a/parakeet/utils/mp_tools.py b/parakeet/utils/mp_tools.py index 0b9c6dc..a4bc97a 100644 --- a/parakeet/utils/mp_tools.py +++ b/parakeet/utils/mp_tools.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle import distributed as dist from functools import wraps @@ -11,11 +25,8 @@ def rank_zero_only(func): @wraps(func) def wrapper(*args, **kwargs): if local_rank != 0: - return + return result = func(*args, **kwargs) return result - + return wrapper - - - diff --git a/parakeet/utils/scheduler.py b/parakeet/utils/scheduler.py index 97e98ec..4d41aca 100644 --- a/parakeet/utils/scheduler.py +++ b/parakeet/utils/scheduler.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math __all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"] @@ -24,7 +38,7 @@ class PieceWise(SchedulerBase): self.xs = [item[0] for item in anchors] self.ys = [item[1] for item in anchors] self.num_anchors = len(self.xs) - + def __call__(self, step): i = 0 for x in self.xs: @@ -34,8 +48,8 @@ class PieceWise(SchedulerBase): return self.ys[0] if i == self.num_anchors: return self.ys[-1] - k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1]) - out = self.ys[i-1] + (step - self.xs[i-1]) * k + k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1]) + out = self.ys[i - 1] + (step - self.xs[i - 1]) * k return out @@ -47,7 +61,7 @@ class StepWise(SchedulerBase): self.xs = [item[0] for item in anchors] self.ys = [item[1] for item in anchors] self.num_anchors = len(self.xs) - + def __call__(self, step): i = 0 for x in self.xs: @@ -58,5 +72,4 @@ class StepWise(SchedulerBase): return self.ys[-1] if i == 0: return self.ys[0] - return self.ys[i-1] - + return self.ys[i - 1] diff --git a/setup.py b/setup.py index ee5f215..0fa9eb7 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,6 @@ setup_info = dict( description='Speech synthesis tools and models based on Paddlepaddle', long_description=long_description, license='Apache 2', - python_requires='>=3.6', install_requires=[ 'numpy', @@ -71,23 +70,18 @@ setup_info = dict( 'yacs', 'tensorboardX', ], - extras_require={ - 'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], - }, + extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, # Package info packages=find_packages(exclude=('tests', 'tests.*')), - zip_safe=True, - - classifiers = [ + zip_safe=True, + classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Topic :: Scientific/Engineering :: Artificial Intelligence' 'License :: OSI Approved :: Apache2 License', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', - ], - - ) + ], ) setup(**setup_info) From bb64e4659a8611b82cbe0413c8c26bcb1b7f1d8e Mon Sep 17 00:00:00 2001 From: iclementine Date: Sun, 20 Dec 2020 13:46:45 +0800 Subject: [PATCH 3/3] discard opencc untill we find an easy solution to install it on windows --- parakeet/frontend/phonectic.py | 8 +++++--- setup.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py index 2b1e2ae..4620b33 100644 --- a/parakeet/frontend/phonectic.py +++ b/parakeet/frontend/phonectic.py @@ -17,7 +17,8 @@ from typing import Union from g2p_en import G2p from g2pM import G2pM from parakeet.frontend import Vocab -from opencc import OpenCC +# discard opencc untill we find an easy solution to install it on windows +# from opencc import OpenCC from parakeet.frontend.punctuation import get_punctuations from parakeet.frontend.normalizer.normalizer import normalize @@ -211,7 +212,7 @@ class Chinese(Phonetics): """ def __init__(self): - self.opencc_backend = OpenCC('t2s.json') + # self.opencc_backend = OpenCC('t2s.json') self.backend = G2pM() self.phonemes = self._get_all_syllables() self.punctuations = get_punctuations("cn") @@ -236,7 +237,8 @@ class Chinese(Phonetics): List[str] The list of pronunciation sequence. """ - simplified = self.opencc_backend.convert(sentence) + # simplified = self.opencc_backend.convert(sentence) + simplified = sentence phonemes = self.backend(simplified) start = self.vocab.start_symbol end = self.vocab.end_symbol diff --git a/setup.py b/setup.py index 0fa9eb7..bf8f266 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ setup_info = dict( 'scipy', 'pandas', 'sox', - 'opencc', + # 'opencc', 'soundfile', 'g2p_en', 'g2pM',