{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import paddle\n",
    "from matplotlib import pyplot as plt\n",
    "from IPython import display as ipd\n",
    "import soundfile as sf\n",
    "import librosa.display\n",
    "from parakeet.utils import display\n",
    "\n",
    "# All models below are run on the first GPU.\n",
    "paddle.set_device(\"gpu:0\")\n",
    "# Presumably the repository root: the `examples.*` imports in the\n",
    "# model-loading cell rely on this entry being on sys.path.\n",
    "sys.path.append(\"../../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab_phones:\n",
      " Vocab(size: 68,\n",
      "stoi:\n",
      "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('$', 4), ('%', 5), ('&r', 6), ('a', 7), ('ai', 8), ('an', 9), ('ang', 10), ('ao', 11), ('b', 12), ('c', 13), ('ch', 14), ('d', 15), ('e', 16), ('ea', 17), ('ei', 18), ('en', 19), ('eng', 20), ('er', 21), ('f', 22), ('g', 23), ('h', 24), ('i', 25), ('ia', 26), ('iai', 27), ('ian', 28), ('iang', 29), ('iao', 30), ('ie', 31), ('ien', 32), ('ieng', 33), ('ii', 34), ('iii', 35), ('io', 36), ('iou', 37), ('j', 38), ('k', 39), ('l', 40), ('m', 41), ('n', 42), ('o', 43), ('ou', 44), ('p', 45), ('q', 46), ('r', 47), ('s', 48), ('sh', 49), ('t', 50), ('u', 51), ('ua', 52), ('uai', 53), ('uan', 54), ('uang', 55), ('uei', 56), ('uen', 57), ('ueng', 58), ('uo', 59), ('v', 60), ('van', 61), ('ve', 62), ('ven', 63), ('veng', 64), ('x', 65), ('z', 66), ('zh', 67)]))\n",
      "vocab_tones:\n",
      " Vocab(size: 10,\n",
      "stoi:\n",
      "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('0', 4), ('1', 5), ('2', 6), ('3', 7), ('4', 8), ('5', 9)]))\n"
     ]
    }
   ],
   "source": [
    "from examples.ge2e.audio_processor import SpeakerVerificationPreprocessor\n",
    "from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder\n",
    "\n",
    "# speaker encoder: `p` turns a reference wav into mel partials and\n",
    "# `speaker_encoder` embeds them (both are used in the speaker-encoding cell).\n",
    "p = SpeakerVerificationPreprocessor(\n",
    "    sampling_rate=16000,\n",
    "    audio_norm_target_dBFS=-30,\n",
    "    vad_window_length=30,\n",
    "    vad_moving_average_width=8,\n",
    "    vad_max_silence_length=6,\n",
    "    mel_window_length=25,\n",
    "    mel_window_step=10,\n",
    "    n_mels=40,\n",
    "    partial_n_frames=160,\n",
    "    min_pad_coverage=0.75,\n",
    "    partial_overlap_ratio=0.5)\n",
    "# Same n_mels as the preprocessor above.\n",
    "speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)\n",
    "speaker_encoder_params_path = \"../../pretrained/ge2e/ge2e_ckpt_0.3/step-3000000.pdparams\"\n",
    "speaker_encoder.set_state_dict(paddle.load(speaker_encoder_params_path))\n",
    "speaker_encoder.eval()\n",
    "\n",
    "# synthesizer\n",
    "from parakeet.models.tacotron2 import Tacotron2\n",
    "from examples.tacotron2_aishell3.chinese_g2p import convert_sentence\n",
    "from examples.tacotron2_aishell3.aishell3 import voc_phones, voc_tones\n",
    "\n",
    "# vocab_size / n_tones equal the vocabulary sizes printed on import (68 / 10);\n",
    "# d_global_condition equals the speaker encoder's output_size (256).\n",
    "synthesizer = Tacotron2(\n",
    "    vocab_size=68,\n",
    "    n_tones=10,\n",
    "    d_mels=80,\n",
    "    d_encoder=512,\n",
    "    encoder_conv_layers=3,\n",
    "    encoder_kernel_size=5,\n",
    "    d_prenet=256,\n",
    "    d_attention_rnn=1024,\n",
    "    d_decoder_rnn=1024,\n",
    "    attention_filters=32,\n",
    "    attention_kernel_size=31,\n",
    "    d_attention=128,\n",
    "    d_postnet=512,\n",
    "    postnet_kernel_size=5,\n",
    "    postnet_conv_layers=5,\n",
    "    reduction_factor=1,\n",
    "    p_encoder_dropout=0.5,\n",
    "    p_prenet_dropout=0.5,\n",
    "    p_attention_dropout=0.1,\n",
    "    p_decoder_dropout=0.1,\n",
    "    p_postnet_dropout=0.5,\n",
    "    d_global_condition=256,\n",
    "    use_stop_token=False,\n",
    ")\n",
    "synthesizer_params_path = \"../../pretrained/tacotron2_aishell3/tacotron2_aishell3_ckpt_0.3/step-450000.pdparams\"\n",
    "synthesizer.set_state_dict(paddle.load(synthesizer_params_path))\n",
    "synthesizer.eval()\n",
    "\n",
    "# vocoder (n_mels equals the synthesizer's d_mels)\n",
    "from parakeet.models import ConditionalWaveFlow\n",
    "vocoder = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])\n",
    "vocoder_params_path = \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\"\n",
    "vocoder.set_state_dict(paddle.load(vocoder_params_path))\n",
    "vocoder.eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 生成 speaker encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "首先在当前文件夹下新建文件夹 `ref_audio`,把要作为参考的音频存放在这个文件夹中。格式要求是 wav 格式,音频会被重采样至 16kHz。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ref_name = \"女声2.wav\"\n",
    "ref_audio_path = f\"./ref_audio/{ref_name}\"\n",
    "ipd.Audio(ref_audio_path, normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mel_sequences: (2, 160, 40)\n",
      "embed shape: [256]\n"
     ]
    }
   ],
   "source": [
    "# Split the reference audio into mel partials, then embed them into a\n",
    "# single utterance-level speaker embedding.\n",
    "mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))\n",
    "print(\"mel_sequences: \", mel_sequences.shape)\n",
    "with paddle.no_grad():\n",
    "    embed = speaker_encoder.embed_utterance(paddle.to_tensor(mel_sequences))\n",
    "print(\"embed shape: \", embed.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 合成频谱"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "因为 AISHELL-3 数据集中使用 `%` 和 `$` 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 `%` 和 `$` 来调节韵律。\n",
    "\n",
    "值得注意的是,句子的有效字符集仅包含汉字和 `%`、`$`,因此输入的句子只能包含这些字符。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['m', 'ei', 'd', 'ang', 'n', 'i', 'j', 've', 'd', 'e', '%', 'x', 'iang', 'iao', 'p', 'i', 'p', 'ieng', 'sh', 'en', 'm', 'e', 'r', 'en', 'd', 'e', 'sh', 'iii', 'h', 'ou', '$', 'n', 'i', 'q', 'ie', 'iao', 'j', 'i', 'zh', 'e', '%', 'zh', 'e', 'g', 'e', 'sh', 'iii', 'j', 'ie', 'sh', 'ang', 'd', 'e', 'r', 'en', '%', 'b', 'ieng', 'f', 'ei', 'd', 'ou', 'j', 'v', 'b', 'ei', 'n', 'i', 'b', 'ieng', 'iou', 'd', 'e', 't', 'iao', 'j', 'ian', '$']\n",
      "['0', '3', '0', '1', '0', '3', '0', '2', '0', '5', '0', '0', '3', '4', '0', '1', '0', '2', '0', '2', '0', '5', '0', '2', '0', '5', '0', '2', '0', '4', '0', '0', '3', '0', '4', '4', '0', '4', '0', '5', '0', '0', '4', '0', '4', '0', '4', '0', '4', '0', '4', '0', '5', '0', '2', '0', '0', '4', '0', '1', '0', '1', '0', '4', '0', '4', '0', '3', '0', '3', '3', '0', '5', '0', '2', '0', '4', '0']\n"
     ]
    }
   ],
   "source": [
    "sentence = \"每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$\"\n",
    "phones, tones = convert_sentence(sentence)\n",
    "print(phones)\n",
    "print(tones)\n",
    "\n",
    "# Map phone / tone symbols to integer ids through the vocabularies.\n",
    "phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)\n",
    "tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)\n",
    "\n",
    "# Add a leading axis of size 1 to each input before calling the synthesizer.\n",
    "phones = paddle.to_tensor(phones).unsqueeze(0)\n",
    "tones = paddle.to_tensor(tones).unsqueeze(0)\n",
    "utterance_embeds = paddle.unsqueeze(embed, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 73%|███████▎ | 733/1000 [00:02<00:01, 255.71it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "content exhausted!\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Inference only: disable gradient tracking, as the speaker-encoder and\n",
    "# vocoder cells do.\n",
    "with paddle.no_grad():\n",
    "    outputs = synthesizer.infer(phones, tones=tones, global_condition=utterance_embeds)\n",
    "# Swap the last two axes of the postnet mel output before passing it to the vocoder.\n",
    "mel_input = paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1])\n",
    "fig = display.plot_alignment(outputs[\"alignments\"][0].numpy().T)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 合成语音"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "合成的语音会保存在 `syn_audio` 目录下(目录不存在时会自动创建),使用和 reference 相同的文件名。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time: 19.793312788009644s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       ""
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Sampling rate used both for writing the wav file and for playback below.\n",
    "syn_sample_rate = 22050\n",
    "\n",
    "with paddle.no_grad():\n",
    "    wav = vocoder.infer(mel_input)\n",
    "wav = wav.numpy()[0]\n",
    "\n",
    "# Create the output folder first so a fresh run does not fail when\n",
    "# `syn_audio` has not been created by hand.\n",
    "os.makedirs(\"syn_audio\", exist_ok=True)\n",
    "sf.write(f\"syn_audio/{ref_name}\", wav, samplerate=syn_sample_rate)\n",
    "librosa.display.waveplot(wav)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ipd.Audio(wav, rate=syn_sample_rate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}