From ef1ea56ed67f777c6af76d3db8f68d2ee9f51f3d Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 7 May 2021 14:51:49 +0800 Subject: [PATCH] fix typos and docs --- examples/tacotron2/synthesize.ipynb | 52 ++++---- examples/tacotron2_aishell3/aishell3.py | 2 +- examples/tacotron2_aishell3/train.py | 4 +- .../tacotron2_aishell3/voice_cloning.ipynb | 117 ++++++++++-------- parakeet/models/tacotron2.py | 6 +- 5 files changed, 100 insertions(+), 81 deletions(-) diff --git a/examples/tacotron2/synthesize.ipynb b/examples/tacotron2/synthesize.ipynb index 2ede277..f71182f 100644 --- a/examples/tacotron2/synthesize.ipynb +++ b/examples/tacotron2/synthesize.ipynb @@ -21,10 +21,10 @@ "\n", "from parakeet.utils import display\n", "from parakeet.utils import layer_tools\n", - "paddle.set_device(\"gpu:5\")\n", + "paddle.set_device(\"gpu:0\")\n", "\n", "import sys\n", - "sys.path.append(\"/home/chenfeiyu/project/Parakeet_0.2\")\n", + "sys.path.append(\"../..\")\n", "import examples" ] }, @@ -114,34 +114,34 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[checkpoint] Rank 0: loaded model from runs/refactor/checkpoints/step-50000.pdparams\n" + "[checkpoint] Rank 0: loaded model from ../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000.pdparams\n" ] } ], "source": [ "frontend = EnglishCharacter()\n", "model = Tacotron2.from_pretrained(\n", - " synthesizer_config, \"runs/refactor/checkpoints/step-50000\")\n", + " synthesizer_config, \"../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000\")\n", "model.eval()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 36%|███▋ | 363/1000 [00:01<00:02, 266.51it/s]" + " 36%|███▋ | 365/1000 [00:01<00:02, 256.89it/s]\n" ] }, { @@ -150,13 +150,6 @@ "text": [ "content 
exhausted!\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ @@ -171,12 +164,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -201,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -252,35 +245,35 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[checkpoint] Rank 0: loaded model from /home/chenfeiyu/projects/Parakeet_0.2/examples/waveflow/pretrained/waveflow_ljspeech_ckpt_0.2/step-2000000.pdparams\n" + "[checkpoint] Rank 0: loaded model from ../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\n" ] } ], "source": [ "vocoder = ConditionalWaveFlow.from_pretrained(\n", " vocoder_config, \n", - " \"/home/chenfeiyu/projects/Parakeet_0.2/examples/waveflow/pretrained/waveflow_ljspeech_ckpt_0.2/step-2000000\")\n", + " \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000\")\n", "layer_tools.recursively_remove_weight_norm(vocoder)\n", "vocoder.eval()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "time: 9.420342922210693s\n" + "time: 9.412613868713379s\n" ] } ], @@ -291,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -299,7 +292,7 @@ "text/html": [ "\n", " \n", " " @@ -308,7 +301,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -316,6 +309,13 @@ "source": [ "ipd.Audio(wav, rate=22050)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/tacotron2_aishell3/aishell3.py b/examples/tacotron2_aishell3/aishell3.py index e9a3126..ab8977a 100644 --- a/examples/tacotron2_aishell3/aishell3.py 
+++ b/examples/tacotron2_aishell3/aishell3.py @@ -11,7 +11,7 @@ from preprocess_transcription import _phones, _tones voc_phones = Vocab(sorted(list(_phones))) print("vocab_phones:\n", voc_phones) voc_tones = Vocab(sorted(list(_tones))) -print("vocab+tones:\n", voc_tones) +print("vocab_tones:\n", voc_tones) class AiShell3(Dataset): diff --git a/examples/tacotron2_aishell3/train.py b/examples/tacotron2_aishell3/train.py index 4ec0a0a..54a33cf 100644 --- a/examples/tacotron2_aishell3/train.py +++ b/examples/tacotron2_aishell3/train.py @@ -183,8 +183,8 @@ class Experiment(ExperimentBase): config.training.weight_decay), grad_clip=grad_clip) criterion = Tacotron2Loss( - use_stop_token_loss=True, - use_guided_attention_loss=False, + use_stop_token_loss=config.model.use_stop_token, + use_guided_attention_loss=config.model.use_guided_attention_loss, sigma=config.model.guided_attention_loss_sigma) self.model = model self.optimizer = optimizer diff --git a/examples/tacotron2_aishell3/voice_cloning.ipynb b/examples/tacotron2_aishell3/voice_cloning.ipynb index 65e9d43..a6b62bc 100644 --- a/examples/tacotron2_aishell3/voice_cloning.ipynb +++ b/examples/tacotron2_aishell3/voice_cloning.ipynb @@ -13,14 +13,14 @@ "import soundfile as sf\n", "import librosa.display\n", "from parakeet.utils import display\n", - "paddle.set_device(\"gpu:5\")\n", + "paddle.set_device(\"gpu:0\")\n", "import sys\n", - "sys.path.append(\"/home/chenfeiyu/projects/Parakeet_0.2/\")" + "sys.path.append(\"../../\")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -36,9 +36,24 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vocab_phones:\n", + " Vocab(size: 68,\n", + "stoi:\n", + "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('$', 4), ('%', 5), ('&r', 6), ('a', 7), ('ai', 8), ('an', 9), 
('ang', 10), ('ao', 11), ('b', 12), ('c', 13), ('ch', 14), ('d', 15), ('e', 16), ('ea', 17), ('ei', 18), ('en', 19), ('eng', 20), ('er', 21), ('f', 22), ('g', 23), ('h', 24), ('i', 25), ('ia', 26), ('iai', 27), ('ian', 28), ('iang', 29), ('iao', 30), ('ie', 31), ('ien', 32), ('ieng', 33), ('ii', 34), ('iii', 35), ('io', 36), ('iou', 37), ('j', 38), ('k', 39), ('l', 40), ('m', 41), ('n', 42), ('o', 43), ('ou', 44), ('p', 45), ('q', 46), ('r', 47), ('s', 48), ('sh', 49), ('t', 50), ('u', 51), ('ua', 52), ('uai', 53), ('uan', 54), ('uang', 55), ('uei', 56), ('uen', 57), ('ueng', 58), ('uo', 59), ('v', 60), ('van', 61), ('ve', 62), ('ven', 63), ('veng', 64), ('x', 65), ('z', 66), ('zh', 67)]))\n", + "vocab+tones:\n", + " Vocab(size: 10,\n", + "stoi:\n", + "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('0', 4), ('1', 5), ('2', 6), ('3', 7), ('4', 8), ('5', 9)]))\n" + ] + } + ], "source": [ "from examples.ge2e.audio_processor import SpeakerVerificationPreprocessor\n", "from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder\n", @@ -57,7 +72,7 @@ " min_pad_coverage=0.75, \n", " partial_overlap_ratio=0.5)\n", "speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)\n", - "speaker_encoder_params_path = \"/home/chenfeiyu/projects/Parakeet_0.2/examples/ge2e/runs/cn/checkpoints/step-3000000.pdparams\"\n", + "speaker_encoder_params_path = \"../../pretrained/ge2e/ge2e_ckpt_0.3/step-3000000.pdparams\"\n", "speaker_encoder.set_state_dict(paddle.load(speaker_encoder_params_path))\n", "speaker_encoder.eval()\n", "\n", @@ -66,9 +81,8 @@ "from examples.tacotron2_aishell3.chinese_g2p import convert_sentence\n", "from examples.tacotron2_aishell3.aishell3 import voc_phones, voc_tones\n", "\n", - "from yacs.config import CfgNode\n", "synthesizer = Tacotron2(\n", - " vocab_size=70,\n", + " vocab_size=68,\n", " n_tones=10,\n", " d_mels= 80,\n", " d_encoder= 512,\n", @@ -92,14 +106,14 @@ " d_global_condition=256,\n", " 
use_stop_token=False,\n", ")\n", - "params_path = \"/home/chenfeiyu/projects/Parakeet_0.2/examples/tacotron2_aishell3/runs/debug/checkpoints/step-55000.pdparams\"\n", + "params_path = \"../../pretrained/tacotron2_aishell3/tacotron2_aishell3_ckpt_0.3/step-450000.pdparams\"\n", "synthesizer.set_state_dict(paddle.load(params_path))\n", "synthesizer.eval()\n", "\n", "# vocoder\n", "from parakeet.models import ConditionalWaveFlow\n", "vocoder = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])\n", - "params_path = \"/home/chenfeiyu/projects/parakeet_examples/waveflow/step-2000000.pdparams\"\n", + "params_path = \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\"\n", "vocoder.set_state_dict(paddle.load(params_path))\n", "vocoder.eval()" ] @@ -111,9 +125,16 @@ "## 生成 speaker encoding" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "首先在当前文件夹下新建文件夹 `ref_audio`,把要作为参考的音频存放在这个文件夹中。格式要求是 wav 格式,采样率会被重采样至 16kHz."
+ ] + }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -130,13 +151,12 @@ "" ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# ref_audio_path = \"/home/chenfeiyu/datasets/aishell3/train/wav/SSB0011/SSB00110001.wav\"\n", "ref_name = \"女声2.wav\"\n", "ref_audio_path = f\"./ref_audio/{ref_name}\"\n", "ipd.Audio(ref_audio_path, normalize=True)" @@ -144,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -171,22 +191,31 @@ "## 合成频谱" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "因为 AISHELL-3 数据集中使用 `%` 和 `$` 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 `%` 和 `$` 来调节韵律。\n", + "\n", + "值得的注意的是,句子的有效字符集仅包含汉字和 `%`, `$`, 因此输入的句子只能包含这些字符。" + ] + }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['v', 'ien', 'd', 'e', 'b', 'iao', 'x', 'ian', 'x', 'ieng', 'sh', 'iii', '%', 'z', 'ai', 'uei', 'l', 'ai', '%', 'j', 'iang', 'b', 'ian', 'd', 'e', 've', 'l', 'ai', 've', 'zh', 'ueng', 'iao', '$']\n", - "['3', '1', '0', '5', '0', '3', '0', '4', '0', '2', '0', '4', '0', '0', '4', '4', '0', '2', '0', '0', '1', '0', '4', '0', '2', '4', '0', '2', '4', '0', '4', '4', '0']\n" + "['m', 'ei', 'd', 'ang', 'n', 'i', 'j', 've', 'd', 'e', '%', 'x', 'iang', 'iao', 'p', 'i', 'p', 'ieng', 'sh', 'en', 'm', 'e', 'r', 'en', 'd', 'e', 'sh', 'iii', 'h', 'ou', '$', 'n', 'i', 'q', 'ie', 'iao', 'j', 'i', 'zh', 'e', '%', 'zh', 'e', 'g', 'e', 'sh', 'iii', 'j', 'ie', 'sh', 'ang', 'd', 'e', 'r', 'en', '%', 'b', 'ieng', 'f', 'ei', 'd', 'ou', 'j', 'v', 'b', 'ei', 'n', 'i', 'b', 'ieng', 'iou', 'd', 'e', 't', 'iao', 'j', 'ian', '$']\n", + "['0', '3', '0', '1', '0', '3', '0', '2', '0', '5', '0', '0', '3', '4', '0', '1', '0', '2', '0', '2', '0', '5', '0', '2', '0', '5', '0', '2', '0', '4', 
'0', '0', '3', '0', '4', '4', '0', '4', '0', '5', '0', '0', '4', '0', '4', '0', '4', '0', '4', '0', '4', '0', '5', '0', '2', '0', '0', '4', '0', '1', '0', '1', '0', '4', '0', '4', '0', '3', '0', '3', '3', '0', '5', '0', '2', '0', '4', '0']\n" ] } ], "source": [ - "sentence = \"语音的表现形式%在未来%将变得越来越重要$\"\n", + "sentence = \"每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$\"\n", "phones, tones = convert_sentence(sentence)\n", "print(phones)\n", "print(tones)\n", @@ -201,14 +230,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 35%|███▍ | 349/1000 [00:01<00:02, 233.91it/s]" + " 74%|███████▍ | 741/1000 [00:02<00:01, 249.49it/s]\n" ] }, { @@ -218,28 +247,11 @@ "content exhausted!\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { + "image/png": "\n", "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXAAAAA+CAYAAAAyPECXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAATWUlEQVR4nO2de4xcV33HP79z57lvr9e7fm0Sx3ZeKpFj8jANBVQeDUmrgMQfQaIgETVVWySQ2koB1Cq0qtQiyqtCoKSkDZRCIKQ0QqYlUMqrwTgxduLEcbKOnazX6/Wu9/2amXvPr3+cO7Oz49n1etfe2YnPRxrNnXvv3PneM7vfOfd3fud3RVXxeDweT/1hai3A4/F4PMvDG7jH4/HUKd7APR6Pp07xBu7xeDx1ijdwj8fjqVMSq/lhKUlrhsbV/EiPx+O5eIiAqntGgfi5MplPQIzrH6vV0jKAppMUmgOSrXlSJgIgEIsiWBUKNiAVhGRNgZSECErP4dkhVd1QKWdVDTxDI7fJ21fzIz0eT71SbpaV6c4iFa9jg1R7/uMWjyUCYjCZNJrPzx0qlUKam5BsBowhv2UdU1vSnH2DEKUV7cqxc8sZHrjqCZIS0SAhAM3G0macpRbUcioSMhLRbgzDdk7Xj6d38pmD72Td3gaywyFqhCBnsYGggZBvMaRHIzKDs5jxGbCWHj79arVTWVUD93g8lzkiSBCgUQRikGSi9FoLIdiotN+850rKDV0Ek0rGqxUpe49ks86cjUFSSaShAc3nkWwGzaYZv6Gdzo++whev+iGNYohQxqwyalOcjRopEPDLiWv42cAOwiOdJKaFzIEsU49v5W9+/i40l4coAhE0l0PD8NzzrDLXZhuHFmyiTPEUgeg8zekN3OPxXHrKTFWjCEmlwKoz7SiCIECSCYjELScSznQTCSSTRq11xhwEkEy44xVCSARMXd/F6TclyG0usGPbANe3nqYpyHFjQy/twSQbExMU1FDQgFHbQLOZYcJm6Sus4+u9e5j4q63c+/O3uh+Pos6KH4imVB/XyClKEx+jiMgqYiqvBOauFhYy74vJeQ1cRLqBrwFduB+FB1X1CyLyAPBHwGC86ydUde+lEurxeOqAUs/ZhSaKPVFMbMzJJCSTaDYNcVhBM2kwEDWlEauoCBPbsuRahNE9ee6/7QfcmO4lT8CsTTKtaaZsmvXBJHkN+PbgrZx86jqajqaY+a/NHDm9niAXcejMldjhUdfjFkHDELVaCrNIEJBNnUFSI0TF0Espvl2GqutdVzndBSM2qzTDfSk98BD4c1U9ICLNwDMi8mS87XOq+plLJ8/j8axZymLUJp12vepsBsmkiTa0MnBLM9ObIMqAhBBdMQsKdiZBywtJ8m0wuzUPoYGUZUPXGFe1DpOPEpw4tZHCdJL1v0jxvY/t5LujnYsIGWU7v5q3RnHGVf66Eg1DF/KYrtxQP+VFzmvgqtoP9MfLEyJyBNhyqYV5PJ5LQDxwV4o1l60zjQ1IOgWRdWGLxqwLWQQGTbhBQokUDQQpRNiGFGY6j21MM3BLM5NXQNuNQ+zpOsHJ6VlGDzWSPhuQGQIbQPsP0gQFCHKWkWshe1rp2i8kpkOCqQKJMzkmJxQosKNwHJvLARCVDTCWMMH8c7hMuaAYuIhcBdwE7ANuBz4iIh8Ensb10keqvOc+4D6ADA0rlOvxXMZUi88utJ8Yd30f95AlkcS0NqMzs5iOduy6ZlQEAsEmA2wmYHhbmsluQUKY2RLSsnmCd3Qf5ersIDnrBgmTEhEhtAXTvJrrYGtqmP5CG48c3oM5niX7lXX0HAmR2TzXcxKdmnY6xLhQRhAgLc007wvRyDoTTiQQY4iGhtGwMHcei53nUrJNLgNkqdUIRaQJ+Cnwd6r6uIh0AUO4q5O/BTap6ocXO0aLtKtPI/R4loEJXLpbFKFxzxSY6z2nkqVMC7ttK7YhOe/thaYEI9emaOsp0Pf
WBB+446cAjIcZRgsN9M+08OLz3TS+GpBrV9pehPWHxpATfUTjk/N+DOal4dVRuKGe+ZE+9oyq3ly5fkk9cBFJAt8FvqGqjwOo6kDZ9oeA718krR5PfVPZA662vXKwbCEjLBp0NgPbugk7GrApF86wgYABNUK+2WATQr5FyNw1wE0dxzAoRixWDSem2jlxpJuWu4Zo2LuJp97Y4CaYBAGSUiQ5xfWtpyCy2OERpCGLpNPY2dz8UEW5Tm/eNWcpWSgCfBU4oqqfLVu/KY6PA7wXOHxpJHo8dUScdSGJBBpnL1RuQwymrdXFmRsyhB1N5Nal0MCZcSUaQL7RMHSLZdOOQa5sHsGIkrcB+ShBqIbe0TbCyDA9kuW6j6c59myhIs59mmu0H0TIymtovE1thBbiGPPo2Nz+05Uje561yFJ64LcDfwg8JyIH43WfAN4vIrtwIZQTwB9fEoUez2qw1Phycd/yMAK4SSlGkHQa09FO1NnG1NYGwux8Q9a4x5xrEwqNYNMwe0WezZuHSBpLOggRmdMQqWGmkGR0pJmtj6Zp+tQEZwfPGWpiI/2l5arR4ZJe4wf/XkcsJQvlF7gJ/5X4nG9PfVMewogNWKP55mayWTeZpLEBbWsmv7GZ2fVJcs0GUWfIKIiCRIBAvkmY3qTojmnaWuZ6sqpCPgwohAG53iZSI4bsgNL9ZEhwcMCltVXkGxugEdgW996jMGRFePN+XeFnYnouP6pNz1aLWkOiq5NoUweFdRnyrQlm2wz5NiHMQNighM2KbQ5JZHNYK4iALRg0FGQ2IDVsaD6hdP84R+rzvUTDI9V79SZwPfZiqCWK3CSThfSqzk3T9nhivIF71iYXkuGwUPijOJgIpVQ2MeKKFSUSSEc709dsQAVm1wdMbTTYFEQZJcyCJhWTd8dNTgoNA0LrUyENx8fQ1065w+YLYAQRd1wAaciiYUg0MrbwOahFI+ZmBi6aMuezPjzV8QbuqQ1LMaMqBgzM1Z8o1syIIjSyBFs2MvDOLUxtirMzBNTEdSkCSpU/iyFmDdwEE7esYCypYUNTLzQMRqSH8iQHJ6H/DJrPl6Zil6ZdF/UQQBBrymZcj3p6Zkm5yhIEaOhzmj3Lwxu4pzYsElYoGrNpacZuaCPf0cjALWlmO60zYphXirl8WexcbWYp98Vy8y6ACYXUKKzrKYCF7KlJ6HmtpE3zeWfWxD8YYuZCHFXqZagqRBEShmC1VOfjfOdfygBZCr737anAG7hndTCu2pykUpg4dDHRnSS3TpjcHkLCui6zJe46g0SCFASJIJhRkmNxj7do2hqbtLjBxPQIbPrfs+jxXjcYGYcn1Oq5U8fLUa2euVHcXG2jKmiciheHQDSfd+l33mg9q4Q3cM+CSCKx9JKY8UxBaXBZGySTaFOWoVs7GNqt0JZHCy6uYaYCghkhmHVG3Xg84Yw4NuTiI8gpqQklNW5pOthHeOr0fDetomtJORYX22BV5z97PKuEN/DLncpYdFncWdJpJIpKRYUWmlUYtLYw/o7r6HuXxWRDJFBQwUaCGVQa+wyJlzNxr1pJTkN2MCQzNIsZmybqOXFuIX9cfBjcQF/oFrxJejxleANfi1S5xF/WMVRdL7pYeD42Zg0L1esei7i852wG3dJJ1JSm//ZGsm8bJJMIMWUTTKy6+/cpMJNPMvoqtD+dIDkVYEIXizYhNPZOYo71EY2NlwxZAlMqjG+jaMGp2qWcbG/aHk9VvIGvRS40fQ7Orb1RflcQ4nCtlG0zAdPvuZmTfxCyceMoTak8gVhmowSzYYKzYykKk0myJyD5yHoi1XPCE6JubLApVDpPTiFHX0Vnc3HYJY4LMxfW0JJRp+bts+J28HguU7yBrwXKix8V0+bKQwoVpUFLb4vrapTiwpIk6N7M1LUbGNyVJPPbQ9y4oR+DUlCDVXfsxkQOq4YfvRCy/v9SBCMd5AqKCRWJIGuVK/IWkyuQGDxL9Mpr587gq/jxULUstbJlqWS
oN2iPZ0V4A681pUkoRYN2aWulsTpVwBLs2EbfXRuZ2JXjuiv76chM0p6aJojvO2JEsWo5Mp7nZH+BRE+C9L+10zvQ5D6mPDQRf+Z1Q1NI/3Hs5JS7LyGUUuWKudbRQoOY86rSXeD0bG/cHs9FwRv4xcLECcpLrTURp9VN/v4uxj4wwQd27CcpEbNx4fyMcb3UCCFAOTA+zIsHNtD66zTT39lC34xlYDIfp9SVzdSzlp1hDjM2jD0zhC1WlatSuvSCU+c8Hs+awhv4xaIs5BG0tpDftZ1j7w949+7n2JCaoDM5Ts4maQ2mKWhAQRNECF8+PMO6x1r46cHdi/dMCyHXT5zATk2jMzOl3Oaq9/oTwVYey/d6PZ7XHd7AKym7w4lk0tiZWbA6N+gGrqZGMkG05wZG/3KKT167l4wUiOKijb359XzvdBNNe6/kpW/ewCuTecxUDsmdm/2xIxrBDrxElMtVzwxZjvF6s/Z4LgsuXwOvWpGumKVhXWGidBqzuYvpba30/U6Cq2/p5d7unxPE/d6HTnaQ+aetPPT029zUaWvBGNcLD0O2ThxyPwBGsIBpaHA51bYsQ6Qy5OLN1+PxLJEVGbiI3AF8AQiAf1bVv78oqlaDOJVOjBB0rCd3/RZu//w+7mo5iFXDvw/v4dnhdjKJKfoHMySeTzL7uc386/7bSlO0jeRpDl8iHBtfeKq2aqnjHo2Pr+45ejye1zXLNnARCYAvAe8ETgL7ReQJVX3hYolbohAkkUTDAkFzs6sG19WOhBaZmkHTKWxzhkJrhjO703TecZIPd/+SnanTTNgMo1EjPbkunhqGfffexP5jne64mztJtWYwozNsD6dguA+dmUVNnOZXKLiqc4XwwnvR57tnosfj8SyBlfTAbwV6VPUVABH5FnA3cGEGXryHYDaLaWkm6lzH5PYmBm41dN44wBs7evmtxj62pwaYsFlGowbubDxOk7hsDYuloJakGAaikFGb4uDsFWRMgY2JUUajRk4V1nFsdgPff/ENTP3LZr7xm7fD2VHEGEglQQSdnIKJl4nycXW4sXGMEZdGV061mtMXSlkhJI/H41kuKzHwLUBv2euTwG2VO4nIfcB9AFdsSbB3/wFsWe6EM+CIURsyEKV4ZvYqfjh0A6cO7CD/7S6e62vnpTPXEIxOoqkkmkny6NHj2HxxMshi+W5XlC1HbOcgVJlRWBWNlpZK53vQHo+nRlzyQUxVfRB4EEBEJlKbXzm6+DteA34GwKuXWNsy6ACGai1iBXj9tcXrrx31rB3gymorV2LgfUB32eut8brFOKqqN6/gM2uKiDzt9dcOr7+21LP+eta+GGYF790P7BSRbSKSAu4Bnrg4sjwej8dzPpbdA1fVUEQ+Avw3Lo3wYVV9/qIp83g8Hs+irCgGrqp7gb0X8JYHV/J5awCvv7Z4/bWlnvXXs/YFkaWWAPV4PB7P2mIlMXCPx+Px1BBv4B6Px1OnrJqBi8gdInJURHpE5P7V+tyVICInROQ5ETkoIk/H69pF5EkReTl+XldrnUVE5GEROSMih8vWVdUrji/G38ezIrK7dspLWqvpf0BE+uLv4KCI3Fm27eOx/qMi8nu1UV3S0i0iPxGRF0TkeRH5aLy+Ltp/Ef310v4ZEfm1iByK9X8qXr9NRPbFOh+NM+YQkXT8uifeflUt9S8bVb3kD1yWyjHgaiAFHAJuWI3PXqHuE0BHxbpPA/fHy/cD/1BrnWXa3gLsBg6fTy9wJ/AD3G0t9wD71qj+B4C/qLLvDfHfURrYFv99BTXUvgnYHS83Ay/FGuui/RfRXy/tL0BTvJwE9sXt+m3gnnj9V4A/iZf/FPhKvHwP8Ggt23+5j9XqgZfqpqhqHijWTalH7gYeiZcfAd5TQy3zUNWfAcMVqxfSezfwNXX8CmgTkU2ro7Q6C+hfiLuBb6lqTlWPAz24v7OaoKr9qnogXp4AjuDKTdRF+y+ifyHWWvurqk7GL5PxQ4HfBR6L11e
2f/F7eQx4u8hyChvVltUy8Gp1Uxb741grKPBDEXkmrukC0KWq/fHyaaCrNtKWzEJ66+k7+UgcZni4LGS1ZvXHl+M34XqBddf+FfqhTtpfRAIROQicAZ7EXRWMqmoY71KusaQ/3j4GrF9dxSvHD2IuzptVdTfwbuDPROQt5RvVXX/VTR5mvemN+TKwHdgF9AP/WFs5iyMiTcB3gY+p6rwC8PXQ/lX01037q2qkqrtwZT1uBa6rsaRLzmoZ+HLqptQcVe2Ln88A/4H7oxgoXurGz2dqp3BJLKS3Lr4TVR2I/zEt8BBzl+lrTr+IJHHm9w1VfTxeXTftX01/PbV/EVUdBX4CvAkXmipOWCzXWNIfb28Fzq6y1BWzWgZed3VTRKRRRJqLy8C7gMM43R+Kd/sQ8J+1UbhkFtL7BPDBOBtiDzBWdqm/ZqiIC78X9x2A039PnE2wDdgJ/Hq19RWJ46dfBY6o6mfLNtVF+y+kv47af4OItMXLWdyNZo7gjPx98W6V7V/8Xt4H/E98hVRfrNZoKW7U/SVcXOqTtR69XYLeq3Gj7IeA54uacXGyHwMvAz8C2muttUzzN3GXuQVcvO/ehfTiRu2/FH8fzwE3r1H9X4/1PYv7p9tUtv8nY/1HgXfXWPubceGRZ4GD8ePOemn/RfTXS/vfCPwm1nkY+Ot4/dW4H5Ye4DtAOl6fiV/3xNuvrqX+5T78VHqPx+OpU/wgpsfj8dQp3sA9Ho+nTvEG7vF4PHWKN3CPx+OpU7yBezweT53iDdzj8XjqFG/gHo/HU6f8P2xHjrc+/yg4AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "
" ] }, "metadata": { @@ -251,7 +263,7 @@ "source": [ "outputs = synthesizer.infer(phones, tones=tones, global_condition=utterance_embeds)\n", "mel_input = paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1])\n", - "fig = display(outputs[\"alignments\"][0].numpy().T)" + "fig = display.plot_alignment(outputs[\"alignments\"][0].numpy().T)" ] }, { @@ -261,31 +273,38 @@ "## 合成语音" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合成的语音会保存在 `syn_audio` 目录下,使用和 reference 相同的文件名。" + ] + }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "time: 12.234672784805298s\n" + "time: 23.468628406524658s\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 22, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -306,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -314,7 +333,7 @@ "text/html": [ "\n", " \n", " " @@ -323,7 +342,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -356,7 +375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.7.7" } }, "nbformat": 4, diff --git a/parakeet/models/tacotron2.py b/parakeet/models/tacotron2.py index 7759f25..d9980f6 100644 --- a/parakeet/models/tacotron2.py +++ b/parakeet/models/tacotron2.py @@ -205,8 +205,8 @@ class Tacotron2Encoder(nn.Layer): Parameters ---------- - x: Tensor [shape=(B, T)] - Batch of the sequencees of padded character ids. + x: Tensor [shape=(B, T, C)] + Input embeddings. text_lens: Tensor [shape=(B,)], optional Batch of lengths of each text input batch. Defaults to None. @@ -502,7 +502,7 @@ class Tacotron2Decoder(nn.Layer): if int(paddle.argmax(alignment[0])) == encoder_steps - 1: if first_hit_end is None: first_hit_end = i - elif i > (first_hit_end + 10): + elif i > (first_hit_end + 20): print("content exhausted!") break if len(mel_outputs) == max_decoder_steps: