Compare commits
185 Commits
SHA1:

87a0cec0c7, 3aa6aed0fb, 4f288a6d4f, 0aa7088d36, 6f1c534557, c6504ade5a, e69ab88fe6, dc055bde0a, 7b0de356f9, 3ebe5ccb33,
c955c4192b, a3de28cbe0, 0af7402daa, f423323bae, 3f60b6e0a3, 30e3b9172f, a0ce65211c, c5acfbd8eb, 087d7bf16e, cf892c5ed7,
73374528d0, b017c73100, eed6f9af08, a18bb23f30, f78a20c4a0, f751e3cfb6, 27cba27d1b, c522e56e86, 9d59de0f3b, 8ba7eeb1da,
4fde5c7e64, c321fcd098, 641be1bc92, c2a279c433, 353212ebde, 7c5e98dfb3, ede6835bd2, e53b9a0745, c615de2354, ddfe2eda76,
39007e5bf8, b0ba6e7bf9, 61ac117df5, e88cbace1c, 91c54575fe, 7b9e9c7a67, 737b09d03c, f5027a5e6f, d2dba13ab7, 3df4ecd455,
d1d6c20672, f9b39b97dd, 46879b291b, c1de6a1e49, 80bf04b710, 9d06ec2d91, 2421a936ed, 2b31cd4f21, 2b3996c64d, eb5b43691f,
51f2753c15, fe7ddc2aaf, bb64e4659a, e03e96d9e4, c866bb0b57, 2c952fbd70, f31643b33c, aa205fd7bb, 18709adce8, b6efb43990,
f2a35a17d4, badf72d611, 544594ec54, 84ad4c9e65, d08eb72791, 255ddcfe32, cf43f2cf03, 0327874f19, 949dfa2f3d, 28fbc60737,
63285dc80f, c2bc4b0474, 1af9127ee6, dd2c5cc6c6, 310366bb54, 163b6f5dc3, 3baffa5f4c, 6b8573898a, ecdeb14a40, d78a8b4e1e,
d81df88173, 49c9cb38be, bbc50faef2, afc476d8c3, b82217f50f, 6420da6197, ddd9cdfbd8, bdf60bec39, a5c81c75d5, c864612dc3,
bf320849bc, 5b93de8a2e, ab56eac676, 3a19150344, 814d047129, 796e0b1e1f, b2bd479f46, 026ae1078b, b4533af207, 99fdd10b5d,
4de58f4a99, a079e767df, 1d2e93c75f, a8b10f50fb, a5f1605051, cecc8735c4, 09f1840082, fb64c79f7a, e30d7ad48f, 07ce84c680,
a1b827460c, f375792c51, e29502f634, b12eda8423, 29cc759241, 4893c9c086, f255eee029, 37d4475810, 62959759f9, 0287f46532,
d3761683e1, a4a0bd8c98, c57e8e7350, a6806389f9, e87bfb7d05, 3ca037453e, a29c74d036, 4df5ad42f6, 810f979dba, 6edc7d8474,
404add2caa, 9cb5c03069, 598d813908, 2ed26d3416, ce29ac68b3, d190ce8d7f, 6101c6ac86, 5e11ce0dcd, 73a2cadc36, 8af831ae3c,
5b5eaaadac, fb49c1e77d, 2dce0887b3, 49231ca8e5, db7598c702, abee3ecdd4, b65cc4d8dc, a01200e437, c7e5aaa540, 0e35119453,
c8622b4699, e470cda881, 01f30d7cc8, 7822a89fec, 098d3795c2, a9177cd6c2, af4da7dd9e, e07441c193, 8094578f6d, 0cdad602e2,
1f71f65c28, 68f5e1de15, 45d6f3b99d, 57d820f055, 36cc543348, c43216ae9b, 2a764d9a10, 580655f33f, c1e0aecdde, 6aa7af1aa4,
53d0382fc7, 5270774bb0, 40457227e6, f9087ea9a2, a8192c79cc
@@ -1,3 +1,9 @@

```
# IDES
*.wpr
*.wpu
*.udb
*.ann

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
```
@@ -0,0 +1,30 @@

```yaml
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF
formats: []

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - method: pip
      path: .
      extra_requirements:
        - doc

    - requirements: docs/requirements.txt
```
README.md
@@ -3,7 +3,7 @@

Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle Fluid dynamic graph and includes many influential TTS models proposed by [Baidu Research](http://research.baidu.com) and other research groups.

<div align="center">
  <img src="images/logo.png" width=450 /> <br>
  <img src="images/logo.png" width=300 /> <br>
</div>

In particular, it features the latest [WaveFlow](https://arxiv.org/abs/1912.01219) model proposed by Baidu Research.

@@ -18,17 +18,15 @@ In order to facilitate exploiting the existing TTS models directly and developin

- Vocoders
  - [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
  - [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281)
  - [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499)

- TTS models
  - [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654)
  - [Neural Speech Synthesis with Transformer Network (Transformer TTS)](https://arxiv.org/abs/1809.08895)
  - [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263)
  - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)

And more will be added in the future.

## Updates

May-07-2021, Add an example for voice cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

See the [guide](docs/experiment_guide.md) for details about how to build your own model and experiment in Parakeet.

## Setup

@@ -40,221 +38,57 @@ sudo apt-get install libsndfile1

### Install PaddlePaddle

See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **1.8.2** or above.
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.0.0rc1** or above.

### Install Parakeet

```bash
pip install -U paddle-parakeet
```

or

```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

### Install CMUdict for nltk

CMUdict from nltk is used to transform text into phonemes.

```python
import nltk
nltk.download("punkt")
nltk.download("cmudict")
```

See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.

## Examples

Entries to the introduction, and the launch of training and synthesis for the different example models:

- [>>> WaveFlow](./examples/waveflow)
- [>>> Clarinet](./examples/clarinet)
- [>>> WaveNet](./examples/wavenet)
- [>>> Deep Voice 3](./examples/deepvoice3)
- [>>> Transformer TTS](./examples/transformer_tts)
- [>>> FastSpeech](./examples/fastspeech)
- [>>> Tacotron2](./examples/tacotron2)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)

## Pre-trained models and audio samples
## Audio samples

Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the URL link to the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model. Click each model name to download, then you can get the compressed package which contains the pre-trained model and the `yaml` config describing how the model is trained.
### TTS models (Acoustic Model + Neural Vocoder)

#### Vocoders

We provide the model checkpoints of WaveFlow with 64, 96 and 128 residual channels, ClariNet and WaveNet.

<div align="center">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 64)</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 96)</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 128)</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_ckpt_1.0.zip">ClariNet</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_ckpt_1.0.zip">WaveNet</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
Check our [website](https://paddle-parakeet.readthedocs.io/en/latest/demo.html) for audio samples.

**Note:** The input mel spectrograms are from the validation dataset, which are not seen during training.

## Checkpoints

#### TTS models

### Tacotron2

1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)

We also provide checkpoints for different end-to-end TTS models, and present the synthesized audio examples for some randomly chosen famous quotes. The corresponding texts are displayed as follows.

### Tacotron2_AISHELL3

1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

||Text | From |
|:-:|:-- | :--: |
0|*Life was like a box of chocolates, you never know what you're gonna get.* | *Forrest Gump* |
1|*With great power there must come great responsibility.* | *Spider-Man*|
2|*To be or not to be, that’s a question.*|*Hamlet*|
3|*Death is just a part of life, something we're all destined to do.*| *Forrest Gump*|
4|*Don’t argue with the people of strong determination, because they may change the fact!*| *William Shakespeare* |

### TransformerTTS

1. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip)

Users have the option to use different vocoders to convert the linear/mel spectrogram to the raw audio in TTS models. Taking this into account, we are going to release the checkpoints for TTS models adapted to different vocoders, including the [Griffin-Lim](https://ieeexplore.ieee.org/document/1164317) algorithm and some neural vocoders.

### WaveFlow

1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)

##### 1) Griffin-Lim

<div align="center">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_1.0.zip">Transformer TTS</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_ckpt_1.0.zip">FastSpeech</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th >
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th >
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
##### 2) Neural vocoders

under preparation

### GE2E

1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)

## Copyright and License

@@ -0,0 +1,233 @@

# Parakeet

Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle 2.0 and includes many influential TTS models proposed by [Baidu Research](http://research.baidu.com) and other research groups.

<img src="./images/logo.png" alt="parakeet-logo" style="zoom: 33%;" />

In particular, it includes the [WaveFlow](https://arxiv.org/abs/1912.01219) model recently proposed by Baidu Research.

- WaveFlow can synthesize 22.05 kHz high-fidelity speech around 40x faster than real-time on an Nvidia V100 GPU without engineered inference kernels, which is faster than [WaveGlow](https://github.com/NVIDIA/waveglow) and several orders of magnitude faster than WaveNet.
- WaveFlow is a small-footprint, flow-based model for generating raw audio. It has only 5.9M trainable parameters, about 1/15 of WaveGlow (87.9M parameters).
- WaveFlow is directly trained with maximum likelihood, without probability density distillation or the auxiliary losses used in Parallel WaveNet and ClariNet, which simplifies the training pipeline and reduces the development cost.

## Overview of models

To facilitate using existing TTS models and developing new ones, Parakeet selects classic models and provides reference implementations based on PaddlePaddle. It further abstracts the TTS workflow and standardizes data preprocessing, module sharing, model configuration, and the training and synthesis pipeline. The currently supported models include vocoders and acoustic models.

- Vocoders
  - [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
  - [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281)
  - [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499)

- Acoustic models
  - [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654)
  - [Neural Speech Synthesis with Transformer Network (Transformer TTS)](https://arxiv.org/abs/1809.08895)
  - [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263)

More models will be added in the future.

To build your own model and experiment with Parakeet, see [how to prepare your own experiment](./docs/experiment_guide_cn.md).

## Installation

See [installation](./docs/installation_cn.md).

## Examples

Parakeet provides several example experiments. They use the models provided by parakeet and cover the complete workflow of experimenting on public datasets, including data processing, model training and inference, and serve as references for experiments and further development.

- [>>> WaveFlow](./examples/waveflow)
- [>>> Clarinet](./examples/clarinet)
- [>>> WaveNet](./examples/wavenet)
- [>>> Deep Voice 3](./examples/deepvoice3)
- [>>> Transformer TTS](./examples/transformer_tts)
- [>>> FastSpeech](./examples/fastspeech)

## Pre-trained models and audio samples

Parakeet also releases trained parameters for the example models, which can be accessed in the following tables. Each column lists the resources for one model, including the download URL of the pre-trained checkpoint, the dataset the model was trained on, and audio samples synthesized with the checkpoint. Click a model name to download a compressed package that also contains the configuration file used to train the model.

#### Vocoders

We provide WaveFlow checkpoints with 64, 96 and 128 residual channels, as well as ClariNet and WaveNet checkpoints.

<div align="center">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 64)</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 96)</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 128)</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res96_ljspeech_samples_1.0/step_2000k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_ckpt_1.0.zip">ClariNet</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_ckpt_1.0.zip">WaveNet</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
|
||||
**Note:** The input mel spectrograms are selected from the validation set; they are not used in training.

#### Acoustic models

We also provide checkpoints for several end-to-end TTS models, and present speech synthesized from randomly selected famous quotes. The corresponding texts are displayed as follows.

| |Text| From |
|:-:|:-- | :--: |
0|*Life was like a box of chocolates, you never know what you're gonna get.* | *Forrest Gump* |
1|*With great power there must come great responsibility.* | *Spider-Man*|
2|*To be or not to be, that’s a question.*|*Hamlet*|
3|*Death is just a part of life, something we're all destined to do.*| *Forrest Gump*|
4|*Don’t argue with the people of strong determination, because they may change the fact!*| *William Shakespeare* |

Users can choose different vocoders to convert the spectrograms produced by the acoustic models into raw audio. We show samples synthesized with the acoustic models combined with the [Griffin-Lim](https://ieeexplore.ieee.org/document/1164317) vocoder and with neural-network-based vocoders.

##### 1) Griffin-Lim vocoder

<div align="center">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_1.0.zip">Transformer TTS</a>
|
||||
</th>
|
||||
<th style="width: 250px">
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_ckpt_1.0.zip">FastSpeech</a>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>LJSpeech </th>
|
||||
<th>LJSpeech </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th >
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_griffin-lim_samples_1.0/step_120000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
<th >
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_0.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_1.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_2.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_3.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a><br>
|
||||
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech_ljspeech_griffin-lim_samples_1.0/step_162000_sentence_4.wav">
|
||||
<img src="images/audio_icon.png" width=250 /></a>
|
||||
</th>
|
||||
</tr>
|
||||
</tbody>
|
||||
<thead>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
##### 2) Neural vocoders

Under development.

## Copyright and License

Parakeet is provided under the [Apache-2.0 license](LICENSE).

@@ -0,0 +1,20 @@

```makefile
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
```
docs/data.md
@@ -1,341 +0,0 @@

# parakeet.data

This short guide shows the design of `parakeet.data` and how we use it in an experiment.

The most important concepts of `parakeet.data` are `DatasetMixin`, `DataCargo`, `Sampler`, `batch function` and `DataIterator`.

## Dataset

A dataset, as we assume here, is a list of examples. You can get its length by `len(dataset)` (which means its length is known, so we have to implement the `__len__()` method for it), and you can access its items randomly by `dataset[i]` (which means we have to implement the `__getitem__()` method for it). Furthermore, you can iterate over it by `iter(dataset)` or `for example in dataset`, which means we have to implement the `__iter__()` method for it.
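As a minimal, standalone illustration of this protocol (a toy example, not a class from `parakeet.data`):

```python
class ToyDataset(object):
    """A minimal dataset: a thin wrapper around a list of examples."""

    def __init__(self, examples):
        self._examples = list(examples)

    def __getitem__(self, i):
        return self._examples[i]

    def __len__(self):
        return len(self._examples)


dataset = ToyDataset([("a.wav", "text a"), ("b.wav", "text b")])
print(len(dataset))    # 2
print(dataset[0])      # ('a.wav', 'text a')
for example in dataset:  # iteration works through __getitem__ and IndexError
    pass
```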
### DatasetMixin

We provide a `DatasetMixin` object which provides the above methods. You can inherit from `DatasetMixin` and implement the `get_example()` method to define your own dataset class. The `get_example()` method is called by the `__getitem__()` method automatically.

We also define several higher-order Dataset classes, the objects of which can be built from some given Dataset objects.

### TupleDataset

A dataset that is a combination of several datasets of the same length. An example of a `TupleDataset` is a tuple of the examples of its constituent datasets.

### DictDataset

A dataset that is a combination of several datasets of the same length. An example of a `DictDataset` is a dict of the examples of its constituent datasets.

### SliceDataset

A `SliceDataset` is a slice of the base dataset.

### SubsetDataset

A `SubsetDataset` is a subset of the base dataset.

### ChainDataset

A `ChainDataset` is the concatenation of several datasets with the same fields.

### TransformDataset

A `TransformDataset` is created by applying a `transform` to the examples of the base dataset. The `transform` is a callable object which takes an example of the base dataset as a parameter and returns an example of the `TransformDataset`. The transformation is lazy, which means it is applied to an example only when requested.
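To make the laziness concrete, here is a toy, standalone sketch of what a `TransformDataset`-style wrapper does (this is illustrative only, not the `parakeet.data` implementation):

```python
class LazyTransformSketch(object):
    """Apply a transform lazily: only when an example is requested."""

    def __init__(self, base, transform):
        self._base = base
        self._transform = transform

    def __getitem__(self, i):
        # The transform runs on access, not when the wrapper is constructed.
        return self._transform(self._base[i])

    def __len__(self):
        return len(self._base)


base = ["a.wav", "b.wav", "c.wav"]
wrapped = LazyTransformSketch(base, lambda path: path.upper())
print(wrapped[1])  # 'B.WAV' -- computed only now
```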
### FilterDataset

A `FilterDataset` is created by applying a `filter` to the base dataset. A `filter` is a predicate that takes an example of the base dataset as a parameter and returns a boolean. Only those examples that pass the filter are included in the `FilterDataset`.

Note that the filter is applied to all the examples in the base dataset when initializing a `FilterDataset`.

### CacheDataset

By default, we preprocess the dataset lazily in `DatasetMixin.get_example()`: an example is preprocessed whenever it is requested. A `CacheDataset` caches the base dataset lazily, so each example is processed only once, when it is first requested. When preprocessing the dataset is slow, you can use a `CacheDataset` to speed it up, but caching may consume a lot of RAM if the dataset is large.

Finally, if preprocessing the dataset is slow and the processed dataset is too large to cache, you can write your own code to save the processed examples into files or databases, and then define a Dataset to load them. `Dataset` is flexible, so you can create your own dataset painlessly.
## DataCargo

`DataCargo`, like `Dataset`, is an iterable object, but it is an iterable object of batches. We need `DataCargo` because in deep learning, batching examples exploits the computational resources of modern hardware. You can iterate over it by `iter(datacargo)` or `for batch in datacargo`. `DataCargo` is an iterable object but not an iterator, in that it can be iterated over more than once.

### batch function

A `batch` is something transformed from a list of examples. Assume that an example is a structure (a tuple in Python, or a struct in C and C++) that consists of several fields; then a list of examples is an array of structures (AOS, e.g. a dataset is an AOS), and a batch here is a structure of arrays (SOA). Here is an example:

The table below represents 2 examples, each of which contains 5 fields.

| weight | height | width | depth | density |
| ------ | ------ | ----- | ----- | ------- |
| 1.2    | 1.1    | 1.3   | 1.4   | 0.8     |
| 1.6    | 1.4    | 1.2   | 0.6   | 1.4     |

The AOS representation and SOA representation of the table are shown below.

AOS:
```text
[(1.2, 1.1, 1.3, 1.4, 0.8),
 (1.6, 1.4, 1.2, 0.6, 1.4)]
```

SOA:
```text
([1.2, 1.6],
 [1.1, 1.4],
 [1.3, 1.2],
 [1.4, 0.6],
 [0.8, 1.4])
```

For the example above, converting an AOS to an SOA is trivial: just stack every field for all the examples. But that is not always the case. When a field contains a sequence, you may have to pad all the sequences to the largest length and then stack them together. In some other cases, we may want to add a field for the batch, for example, a `valid_length` for each example. So in general, a function that transforms an AOS into an SOA is needed to build a `DataCargo` from a dataset. We call this the batch function (`batch_fn`), but you can use any callable object if you need to.

Usually we need to define the batch function as a callable object which stores all the options and configurations as its members. Its `__call__()` method transforms a list of examples into a batch.
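For instance, a batch function for examples that are `(text_ids, label)` pairs could pad the variable-length field and stack the rest. This is a standalone sketch using only numpy, not a `parakeet.data` utility:

```python
import numpy as np


class TextBatcher(object):
    """Batch function: list of (text_ids, label) examples -> (padded_ids, lengths, labels)."""

    def __init__(self, pad_id=0):
        self.pad_id = pad_id

    def __call__(self, examples):
        texts = [np.asarray(ids, dtype=np.int64) for ids, _ in examples]
        labels = np.asarray([label for _, label in examples], dtype=np.int64)
        lengths = np.asarray([t.size for t in texts], dtype=np.int64)

        # AOS -> SOA: pad every sequence to the longest one, then stack.
        max_len = int(lengths.max())
        padded = np.full((len(texts), max_len), self.pad_id, dtype=np.int64)
        for i, t in enumerate(texts):
            padded[i, :t.size] = t
        return padded, lengths, labels


batch_fn = TextBatcher()
padded, lengths, labels = batch_fn([([1, 2, 3], 0), ([4, 5], 1)])
# padded has shape (2, 3); the second row is padded with pad_id.
```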
### Sampler

Equipped with a batch function (we now know __how to batch__), here comes the next question: __what to batch?__ We need to decide which examples to pick when creating a batch. Since a dataset is a list of examples, we only need to pick indices for the corresponding examples. A sampler object is what we use to do this.

A `Sampler` is represented as an iterable object of integers. Assume the dataset has `N` examples; then an iterable object of integers in the range `[0, N)` is an appropriate sampler for this dataset to build a `DataCargo`.

We provide several samplers that are ready to use, for example, `SequentialSampler` and `RandomSampler`.
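Conceptually, such samplers are nothing more than iterables of indices. The sketch below is illustrative only and is not the library implementation:

```python
import numpy as np


class SequentialSamplerSketch(object):
    """Yields 0, 1, ..., N-1 in order."""

    def __init__(self, dataset):
        self.n = len(dataset)

    def __iter__(self):
        return iter(range(self.n))


class RandomSamplerSketch(object):
    """Yields a fresh random permutation of 0, 1, ..., N-1 on every pass."""

    def __init__(self, dataset):
        self.n = len(dataset)

    def __iter__(self):
        return iter(np.random.permutation(self.n).tolist())


indices = list(RandomSamplerSketch(range(5)))  # e.g. [3, 0, 4, 1, 2]
```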
## DataIterator

A `DataIterator` is what is returned by `iter(data_cargo)`. It can only be iterated over once.

Here's the analogy.

```text
Dataset   --> Iterable[Example] | iter(Dataset)   -> Iterator[Example]
DataCargo --> Iterable[Batch]   | iter(DataCargo) -> Iterator[Batch]
```

In order to construct an iterator of batches from an iterator of examples, we construct a `DataCargo` from a `Dataset`.

## Code Example

Here's an example of how we use `parakeet.data` to process the `LJSpeech` dataset with a wavenet model.

First, we would like to define a class which represents the LJSpeech dataset and loads it as-is. We try not to apply any preprocessing here.

```python
import csv
import numpy as np
import librosa
from pathlib import Path
import pandas as pd

from parakeet.data import DatasetMixin
from parakeet.data import batch_spec, batch_wav


class LJSpeechMetaData(DatasetMixin):
    def __init__(self, root):
        self.root = Path(root)
        self._wav_dir = self.root.joinpath("wavs")
        csv_path = self.root.joinpath("metadata.csv")
        self._table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

    def get_example(self, i):
        fname, raw_text, normalized_text = self._table.iloc[i]
        fname = str(self._wav_dir.joinpath(fname + ".wav"))
        return fname, raw_text, normalized_text

    def __len__(self):
        return len(self._table)
```

We make this dataset simple on purpose. It requires only the path of the dataset, nothing more. It only loads the `metadata.csv` of the dataset when it is initialized, which contains the file names of the audio files and the transcriptions. We do not even load the audio files in `get_example()`.

Then we define a `Transform` object to transform an example of `LJSpeechMetaData` into an example we want for the model.

```python
class Transform(object):
    def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __call__(self, example):
        wav_path, _, _ = example

        sr = self.sample_rate
        n_fft = self.n_fft
        win_length = self.win_length
        hop_length = self.hop_length
        n_mels = self.n_mels

        wav, loaded_sr = librosa.load(wav_path, sr=None)
        assert loaded_sr == sr, "sample rate does not match, resampling applied"

        # Pad audio to the right size.
        frames = int(np.ceil(float(wav.size) / hop_length))
        fft_padding = (n_fft - hop_length) // 2
        desired_length = frames * hop_length + fft_padding * 2
        pad_amount = (desired_length - wav.size) // 2

        if wav.size % 2 == 0:
            wav = np.pad(wav, (pad_amount, pad_amount), mode='reflect')
        else:
            wav = np.pad(wav, (pad_amount, pad_amount + 1), mode='reflect')

        # Normalize audio.
        wav = wav / np.abs(wav).max() * 0.999

        # Compute the magnitude spectrogram.
        # Turn center to False to prevent internal padding.
        spectrogram = librosa.core.stft(
            wav,
            hop_length=hop_length,
            win_length=win_length,
            n_fft=n_fft,
            center=False)
        spectrogram_magnitude = np.abs(spectrogram)

        # Compute the mel-spectrogram.
        mel_filter_bank = librosa.filters.mel(sr=sr,
                                              n_fft=n_fft,
                                              n_mels=n_mels)
        mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)

        # Rescale mel_spectrogram.
        min_level, ref_level = 1e-5, 20  # hard code it
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        # Extract the center of the audio that corresponds to the mel spectrogram.
        audio = wav[fft_padding:-fft_padding]
        assert mel_spectrogram.shape[1] * hop_length == audio.size

        # there is no clipping here
        return audio, mel_spectrogram
```

`Transform` loads the audio files and extracts the `mel_spectrogram` from them. This transformation needs quite a few options to be specified, namely, the sample rate of the audio files, the `n_fft`, `win_length` and `hop_length` of the `stft` transformation, and `n_mels` for transforming the spectrogram into a mel spectrogram. So we define it as a callable class. You can also use a closure, or a `partial`, if you want to.

Then we define a functor to batch examples into a batch. Because the two fields (`audio` and `mel_spectrogram`) are both sequences, batching them is not trivial. Also, because the wavenet model trains on audio clips of a fixed length (0.5 seconds, for example), we have to truncate the audio when creating batches. We want to crop the audio randomly when creating batches, instead of truncating it when preprocessing each example, because this allows an audio clip to be truncated at different positions.

```python
class DataCollector(object):
    def __init__(self,
                 context_size,
                 sample_rate,
                 hop_length,
                 train_clip_seconds,
                 valid=False):
        frames_per_second = sample_rate // hop_length
        train_clip_frames = int(
            np.ceil(train_clip_seconds * frames_per_second))
        context_frames = context_size // hop_length
        self.num_frames = train_clip_frames + context_frames

        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.valid = valid

    def random_crop(self, sample):
        audio, mel_spectrogram = sample
        audio_frames = int(audio.size) // self.hop_length
        max_start_frame = audio_frames - self.num_frames
        assert max_start_frame >= 0, "audio is too short to be cropped"

        frame_start = np.random.randint(0, max_start_frame)
        # frame_start = 0  # norandom
        frame_end = frame_start + self.num_frames

        audio_start = frame_start * self.hop_length
        audio_end = frame_end * self.hop_length

        audio = audio[audio_start:audio_end]
        return audio, mel_spectrogram, audio_start

    def __call__(self, samples):
        # transform them first
        if self.valid:
            samples = [(audio, mel_spectrogram, 0)
                       for audio, mel_spectrogram in samples]
        else:
            samples = [self.random_crop(sample) for sample in samples]
        # batch them
        audios = [sample[0] for sample in samples]
        audio_starts = [sample[2] for sample in samples]
        mels = [sample[1] for sample in samples]

        mels = batch_spec(mels)

        if self.valid:
            audios = batch_wav(audios, dtype=np.float32)
        else:
            audios = np.array(audios, dtype=np.float32)
        audio_starts = np.array(audio_starts, dtype=np.int64)
        return audios, mels, audio_starts
```

When these 3 components are defined, we can start building our dataset with them.

```python
# building the ljspeech dataset
ljspeech_meta = LJSpeechMetaData(root)
transform = Transform(sample_rate, n_fft, win_length, hop_length, n_mels)
ljspeech = TransformDataset(ljspeech_meta, transform)

# split them into train and valid dataset
ljspeech_valid = SliceDataset(ljspeech, 0, valid_size)
ljspeech_train = SliceDataset(ljspeech, valid_size, len(ljspeech))

# building batch functions (they can be different for training and validation if you need it)
train_batch_fn = DataCollector(context_size, sample_rate, hop_length,
                               train_clip_seconds)
valid_batch_fn = DataCollector(
    context_size, sample_rate, hop_length, train_clip_seconds, valid=True)

# building the data cargo
train_cargo = DataCargo(
    ljspeech_train,
    train_batch_fn,
    batch_size,
    sampler=RandomSampler(ljspeech_train))

valid_cargo = DataCargo(
    ljspeech_valid,
    valid_batch_fn,
    batch_size=1,  # only batch=1 for validation is enabled
    sampler=SequentialSampler(ljspeech_valid))
```

Here comes the next question: how to bring batches into Paddle's computation. Do we need some adapter to transform `numpy.ndarray` into Paddle's native `Variable` type? Yes.

First, we can use `var = dg.to_variable(array)` to transform an ndarray into a `Variable`.

```python
for batch in train_cargo:
    audios, mels, audio_starts = batch
    audios = dg.to_variable(audios)
    mels = dg.to_variable(mels)
    audio_starts = dg.to_variable(audio_starts)

    # your training code here
```

In the code above, processing of the data and training of the model run in the same process, so the next batch starts to load only after training on the current batch has finished. There are actually better solutions for this: data processing and model training can run asynchronously. To accomplish this, we use `DataLoader` from Paddle. It serves as an adapter that transforms an iterable object of batches into another iterable object of batches, which runs asynchronously and transforms each ndarray into a `Variable`.

```python
# connect our data cargos with the corresponding DataLoader
# now the data cargo is connected with paddle
with dg.guard(place):
    train_loader = fluid.io.DataLoader.from_generator(
        capacity=10, return_list=True).set_batch_generator(train_cargo, place)
    valid_loader = fluid.io.DataLoader.from_generator(
        capacity=10, return_list=True).set_batch_generator(valid_cargo, place)

    # iterate over the dataloader
    for batch in train_loader:
        audios, mels, audio_starts = batch
        # your training script here
```
@@ -1,87 +0,0 @@

# How to build your own model and experiment?

For a general deep learning experiment, there are 4 parts to care for:

1. Preprocess the dataset to meet the needs of model training, and iterate over it in batches;
2. Define the model and the optimizer;
3. Write the training process (including forward-backward computation, parameter updates, logging, evaluation, etc.);
4. Configure and launch the experiment.

## Data Processing

For processing data, `parakeet.data` provides `DatasetMixin`, `DataCargo` and `DataIterator`.

A dataset is an iterable object of examples. `DatasetMixin` provides the standard indexing interface, and other classes in [parakeet.data.dataset](../parakeet/data/dataset.py) provide flexible interfaces for building customized datasets.

`DataCargo` is an iterable object of batches. It differs from a dataset in that it can be iterated over in batches. In addition to a dataset, a `Sampler` and a `batch function` are required to build a `DataCargo`. The `Sampler` specifies which examples to pick, and the `batch function` specifies how to create a batch from them. Commonly used `Sampler`s are provided by [parakeet.data](../parakeet/data/). Users should define a `batch function` for a dataset in order to batch its examples.

`DataIterator` is an iterator class for `DataCargo`. It is created when explicitly creating an iterator of a `DataCargo` by `iter(DataCargo)`, or when iterating over a `DataCargo` with a `for` loop.

Data processing is split into two phases: sample-level processing and batching.

1. Sample-level processing. This process transforms an example into another. It can be defined as the `get_example()` method of a dataset, or as a `transform` (a callable object) used to build a `TransformDataset`.

2. Batching. This is the process of transforming a list of examples into a batch. The rationale is to transform an array of structures into a structure of arrays. We generally define a batch function (or a callable object) to do this.

To connect a `DataCargo` with PaddlePaddle's asynchronous data loading mechanism, we need to create a `fluid.io.DataLoader` and connect it to the `DataCargo`.

The overview of data processing in an experiment with Parakeet is:

```text
Dataset --(transform)--> Dataset  --+
                         sampler  --+
                         batch_fn --+-> DataCargo --> DataLoader
```

The user needs to define a customized transform and a batch function to accomplish this process. See [data](./data.md) for more details.
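Putting the pieces together, a minimal sketch of this pipeline is shown below. It assumes the `parakeet.data` API shown in [data](./data.md); `MyMetaDataset`, `MyTransform`, `MyBatchFn` and `data_root` are placeholders you would define for your experiment, and in dygraph mode the loader is typically created inside a guard.

```python
import paddle.fluid as fluid
from parakeet.data import TransformDataset, DataCargo, RandomSampler

place = fluid.CUDAPlace(0)  # or fluid.CPUPlace()

meta = MyMetaDataset(data_root)                  # placeholder: raw file paths and transcriptions
dataset = TransformDataset(meta, MyTransform())  # sample-level processing (placeholder transform)
train_cargo = DataCargo(
    dataset,
    MyBatchFn(),                                 # placeholder batch function
    batch_size=8,
    sampler=RandomSampler(dataset))

# connect to PaddlePaddle's asynchronous data loading
train_loader = fluid.io.DataLoader.from_generator(
    capacity=10, return_list=True).set_batch_generator(train_cargo, place)
```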
## Model

Parakeet provides commonly used functions, modules and models for users to define their own models. Functions contain no trainable `Parameter`s and are used in modules and models. Modules and models are subclasses of `fluid.dygraph.Layer`. The distinction is that `module`s tend to be generic, simple and highly reusable, while `model`s tend to be task-specific, complicated and not that reusable. Some models are so complicated that we extract building blocks from them as separate classes, but if these building blocks are not common and reusable enough, they are considered submodels.

In the structure of the project, modules are placed in [parakeet.modules](../parakeet/modules/), while models are in [parakeet.models](../parakeet/models) and grouped into folders like `waveflow` and `wavenet`, which include the whole model and their submodels.

When developers want to add new models to `parakeet`, they can consider the distinctions described above and put the code in an appropriate place.

## Training Process

The training process is basically running a training loop multiple times. A typical training loop consists of the procedures below:

1. Iterating over the training dataset;
2. Preprocessing mini-batches;
3. Forward/backward computations of the neural networks;
4. Updating parameters;
5. Evaluating the model on the validation dataset;
6. Logging or saving intermediate results;
7. Saving checkpoints of the model and the optimizer.

In the section `Data Processing` we have covered steps 1 and 2.

`Model` and `Optimizer` cover steps 3 and 4.

To keep the training loop clear, it's a good idea to define functions for saving/loading checkpoints, evaluation on the validation set, logging and saving of intermediate results, etc. For some complicated models, it is also recommended to define a function to create the model. This function can be used in both training and inference, to ensure that the model is identical in the two settings.
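A skeletal training loop along these lines might look like the sketch below. This is only a sketch: `model`, `optimizer` and the loaders come from your experiment, `evaluate`, `log_metrics` and `save_checkpoint` stand for the helper functions mentioned above, and the model is assumed to return its training loss.

```python
def train(model, optimizer, train_loader, valid_loader, num_epochs, output_dir):
    global_step = 0
    for epoch in range(num_epochs):
        for batch in train_loader:               # 1. iterate over the training dataset
            loss = model(*batch)                 # 3. forward pass (assumes the model returns its loss)
            loss.backward()                      # 3. backward pass
            optimizer.minimize(loss)             # 4. update parameters
            model.clear_gradients()
            global_step += 1

            if global_step % 1000 == 0:
                evaluate(model, valid_loader)            # 5. evaluation
                log_metrics(global_step, loss)           # 6. logging / intermediate results
                save_checkpoint(model, optimizer,        # 7. checkpointing
                                output_dir, global_step)
```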
Code is typically organized in this way:

```text
├── configs/ (example configuration)
├── data.py (definition of custom Dataset, transform and batch function)
├── README.md (README for the experiment)
├── synthesis.py (code for inference)
├── train.py (code for training)
└── utils.py (all other utility functions)
```

## Configuration

Deep learning experiments have many options to configure. These configurations can be roughly grouped into different types: configurations about the path of the dataset and the path to save results, configurations about how to process data, configurations about the model, and configurations about the training process.

Some configurations tend to change when running the code at different times, for example, the path of the data, the path to save results, and whether to load the model before training. For these configurations, it's better to define them as command line arguments. We use `argparse` to handle them.

Other groups of configurations may overlap with each other. For example, data processing and the model may have some common options. The recommended way is to save them as configuration files, for example, in `yaml` or `json`. We prefer `yaml`, for it is more human-readable.
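For illustration, a minimal sketch of this split follows; the option names are only examples, not Parakeet's actual arguments.

```python
import argparse

import yaml

parser = argparse.ArgumentParser(description="Train a model.")
parser.add_argument("--config", type=str, required=True, help="path to the yaml config file")
parser.add_argument("--data", type=str, required=True, help="path to the dataset")
parser.add_argument("--output", type=str, default="runs", help="directory to save results")
parser.add_argument("--checkpoint", type=str, default=None, help="checkpoint to resume from")
args = parser.parse_args()

with open(args.config, "rt") as f:
    config = yaml.safe_load(f)  # e.g. model sizes and data processing options
```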
There are several examples in this repo; check [Parakeet/examples](../examples) for more details. `Parakeet/examples` is where we place our experiments. Though the experiments are not a part of the package `parakeet`, they are a part of the repo `Parakeet`. They are provided as examples and allow users to run our experiments out of the box. Feel free to add new examples and contribute to `Parakeet`.
@@ -0,0 +1,35 @@

```bat
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
```
@@ -0,0 +1 @@

```
paddlepaddle==2.0.0.rc1
```
@ -0,0 +1,155 @@
|
|||
======================
|
||||
Advanced Usage
|
||||
======================
|
||||
|
||||
This section covers how to extend parakeet by implementing your own models and
|
||||
experiments. Guidelines on implementation are also elaborated.
|
||||
|
||||
Model
|
||||
-------------
|
||||
|
||||
As a common practice with paddlepaddle, models are implemented as subclasses
|
||||
of ``paddle.nn.Layer``. Models could be simple, like a single layer RNN. For
|
||||
complicated models, it is recommended to split the model into different
|
||||
components.
|
||||
|
||||
For an encoder-decoder model, it is natural to split it into the encoder and
|
||||
the decoder. For a model composed of several similar layers, it is natural to
|
||||
extract the sublayer as a separate layer.
|
||||
|
||||
There are two common ways to define a model which consists of several modules.
|
||||
|
||||
#. Define a module given the specifications. Here is an example with a multilayer
|
||||
perceptron.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class MLP(nn.Layer):
|
||||
def __init__(self, input_size, hidden_size, output_size):
    super().__init__()
|
||||
self.linear1 = nn.Linear(input_size, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, output_size)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(paddle.tanh(self.linear1(x)))
|
||||
|
||||
module = MLP(16, 32, 4) # initialize a module
|
||||
|
||||
When the module is intended to be a generic and reusable layer that can be
|
||||
integrated into a larger model, we prefer to define it in this way.
|
||||
|
||||
For considerations of readability and usability, we strongly recommend
|
||||
**NOT** to pack specifications into a single object. Here's an example below.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class MLP(nn.Layer):
|
||||
def __init__(self, hparams):
    super().__init__()
|
||||
self.linear1 = nn.Linear(hparams.input_size, hparams.hidden_size)
|
||||
self.linear2 = nn.Linear(hparams.hidden_size, hparams.output_size)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(paddle.tanh(self.linear1(x)))
|
||||
|
||||
For a module defined in this way, it's harder for the user to initialize an
|
||||
instance. Users have to read the code to check what attributes are used.
|
||||
|
||||
Also, code in this style tends to be abused by passing a huge config object
|
||||
to initialize every module used in an experiment, though each module may
|
||||
not need the whole configuration.
|
||||
|
||||
We prefer to be explicit.
|
||||
|
||||
#. Define a module by composing given components. Here is an example
|
||||
for a sequence-to-sequence model.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class Seq2Seq(nn.Layer):
|
||||
def __init__(self, encoder, decoder):
    super().__init__()
|
||||
self.encoder = encoder
|
||||
self.decoder = decoder
|
||||
|
||||
def forward(self, x):
|
||||
encoder_output = self.encoder(x)
|
||||
output = self.decoder(encoder_output)
|
||||
return output
|
||||
|
||||
encoder = Encoder(...)
|
||||
decoder = Decoder(...)
|
||||
model = Seq2Seq(encoder, decoder) # compose two components
|
||||
|
||||
When a model is complicated and made up of several components, each of which
|
||||
has a separate functionality, and can be replaced by other components with the
|
||||
same functionality, we prefer to define it in this way.
|
||||
|
||||
Data
|
||||
-------------
|
||||
|
||||
Another critical component of a deep learning project is data. As a common
|
||||
practice, we use the dataset and dataloader abstraction.
|
||||
|
||||
Dataset
|
||||
^^^^^^^^^^
|
||||
A dataset is the representation of a set of examples used by a project. In most
|
||||
cases, a dataset is a collection of examples. A dataset is an object which has the
|
||||
methods below.
|
||||
|
||||
#. ``__len__``, to get the size of the dataset.
|
||||
#. ``__getitem__``, to get an example by key or index.
|
||||
|
||||
An example is a record consisting of several fields. In practice, we usually
|
||||
represent it as a namedtuple for convenience, yet dicts and user-defined objects
|
||||
are also supported.
|
||||
|
||||
We define our own dataset by subclassing ``paddle.io.Dataset``.
|
||||
|
||||
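As a minimal sketch (the field names are made up for illustration), a custom
dataset may look like this.

.. code-block:: python

    from paddle.io import Dataset

    class InMemoryDataset(Dataset):
        """A toy dataset whose examples are (text_ids, mel) pairs held in memory."""
        def __init__(self, records):
            self.records = records  # a list of examples

        def __len__(self):
            return len(self.records)

        def __getitem__(self, i):
            return self.records[i]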
DataLoader
|
||||
^^^^^^^^^^^
|
||||
In deep learning practice, models are trained with minibatches. DataLoader
|
||||
meets the need for iterating the dataset in batches. This is done by providing
|
||||
a sampler and a batch function in addition to a dataset.
|
||||
|
||||
#. a sampler, which samples indices or keys used to get examples from the dataset;
|
||||
#. a batch function, which transforms a list of examples into a batch.
|
||||
|
||||
A commonly used sampler is ``RandomSampler``: it shuffles all the valid
|
||||
indices and then iterates over them sequentially. ``DistributedBatchSampler`` is
|
||||
a sampler used for distributed data-parallel training, where the sampler handles
|
||||
data sharding in a dynamic way.
|
||||
|
||||
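A sketch of how a dataset, a sampler and a batch function fit together (the
batch size is arbitrary and ``batch_fn`` stands for a batch function like the
one sketched below).

.. code-block:: python

    from paddle.io import DataLoader, DistributedBatchSampler

    sampler = DistributedBatchSampler(dataset, batch_size=32, shuffle=True)
    loader = DataLoader(dataset,
                        batch_sampler=sampler,
                        collate_fn=batch_fn)  # paddle calls the batch function collate_fn
    for batch in loader:
        ...  # each batch has already been collated by batch_fn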
The batch function is used to transform the selected examples into a batch. For a simple
|
||||
case where an example is composed of several fields, each of which is represented
|
||||
by a fixed-size array, the batch function can simply stack each field. For
|
||||
cases where variable-size arrays are included in the example, batching could
|
||||
involve padding and stacking. In theory, the batch function can do more, such as
|
||||
random slicing, etc.
|
||||
|
||||
For a custom dataset used for a custom model, it is required to define a batch
|
||||
function for it.
|
||||
|
||||
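For illustration, a batch function that pads variable-length text ids and
stacks fixed-size spectrogram fields could be sketched as follows (not the
exact functions used in parakeet).

.. code-block:: python

    import numpy as np

    def batch_fn(examples):
        """Transform a list of (ids, mel) examples into a batch."""
        ids, mels = zip(*examples)
        lengths = np.array([len(x) for x in ids], dtype=np.int64)
        max_len = lengths.max()
        # pad text ids to the longest length, then stack
        ids = np.stack([np.pad(x, (0, max_len - len(x))) for x in ids])
        mels = np.stack(mels)  # assumes the mels already have the same shape
        return ids, mels, lengths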
Config
|
||||
-------------
|
||||
|
||||
It's common to change the running configuration to compare results. To keep track
|
||||
of running configuration, we use ``yaml`` configuration files.
|
||||
|
||||
Also, we want to interact with command line options. Some options that usually
|
||||
change with the running environment are provided as command line arguments.
|
||||
In addition, we want to override an option in the config file without editing
|
||||
it.
|
||||
|
||||
Taking these requirements into consideration, we use `yacs <https://github.com/rbgirshick/yacs>`_
|
||||
as a config management tool. Other tools like `omegaconf <https://github.com/omry/omegaconf>`_
|
||||
are also powerful and have similar functions.
|
||||
|
||||
In each example provided, there is a ``config.py``, where the default config is
|
||||
defined. If you want to get the default config, import ``config.py`` and call
|
||||
``get_cfg_defaults()`` to get the default config. Then it can be updated with
|
||||
yaml config file or command line arguments if needed.
|
||||
|
||||
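A sketch of how such a config is typically used in a training script (the
yaml path and the option key are placeholders).

.. code-block:: python

    from config import get_cfg_defaults  # the config.py shipped with each example

    config = get_cfg_defaults()                     # a yacs CfgNode with default values
    config.merge_from_file("conf.yaml")             # override defaults with a yaml file
    config.merge_from_list(["training.lr", 0.001])  # override individual options
    config.freeze()                                 # make the config read-only afterwards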
For details about how to use yacs in experiments, see `yacs <https://github.com/rbgirshick/yacs>`_.
|
||||
|
||||
|
||||
Experiment
|
||||
--------------
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
===========
|
||||
Basic Usage
|
||||
===========
|
||||
|
||||
This section shows how to use pretrained models provided by parakeet and make
|
||||
inference with them.
|
||||
|
||||
Pretrained models are provided in an archive. Extract it to get a folder like
|
||||
this::
|
||||
|
||||
checkpoint_name/
|
||||
├──config.yaml
|
||||
└──step-310000.pdparams
|
||||
|
||||
The ``config.yaml`` stores the config used to train the model, and
|
||||
``step-N.pdparams`` is the parameter file, where N is the number of steps the model has been
|
||||
trained for.
|
||||
|
||||
The example code below shows how to use the models for prediction.
|
||||
|
||||
text to spectrogram
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The code below shows how to use a transformer_tts model. After loading the
|
||||
pretrained model, use ``model.predict(sentence)`` to generate spectrograms
|
||||
(in numpy.ndarray format), which can be further used to synthesize raw audio
|
||||
with a vocoder.
|
||||
|
||||
>>> import parakeet
|
||||
>>> from parakeet.frontend import English
|
||||
>>> from parakeet.models import TransformerTTS
|
||||
>>> from pathlib import Path
|
||||
>>> import yacs.config
|
||||
>>>
|
||||
>>> # load the pretrained model
|
||||
>>> frontend = English()
|
||||
>>> checkpoint_dir = Path("transformer_tts_pretrained")
|
||||
>>> config = yacs.config.CfgNode.load_cfg(str(checkpoint_dir / "config.yaml"))
|
||||
>>> checkpoint_path = str(checkpoint_dir / "step-310000")
|
||||
>>> model = TransformerTTS.from_pretrained(
|
||||
>>> frontend, config, checkpoint_path)
|
||||
>>> model.eval()
|
||||
>>>
|
||||
>>> # text to spectrogram
|
||||
>>> sentence = "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition"
|
||||
>>> outputs = model.predict(sentence)
|
||||
>>> mel_output = outputs["mel_output"]
|
||||
|
||||
vocoder
|
||||
^^^^^^^^^^
|
||||
|
||||
Like the example above, after loading the pretrained ``ConditionalWaveFlow``
|
||||
model, call ``model.predict(mel)`` to synthesize raw audio (in wav format).
|
||||
|
||||
>>> import soundfile as sf
|
||||
>>> from parakeet.models import ConditionalWaveFlow
|
||||
>>>
|
||||
>>> # load the pretrained model
|
||||
>>> checkpoint_dir = Path("waveflow_pretrained")
|
||||
>>> config = yacs.config.CfgNode.load_cfg(str(checkpoint_dir / "config.yaml"))
|
||||
>>> checkpoint_path = str(checkpoint_dir / "step-2000000")
|
||||
>>> vocoder = ConditionalWaveFlow.from_pretrained(config, checkpoint_path)
|
||||
>>> vocoder.eval()
|
||||
>>>
|
||||
>>> # synthesize
|
||||
>>> audio = vocoder.predict(mel_output)
|
||||
>>> sf.write(audio_path, audio, config.data.sample_rate)
|
||||
|
||||
For more details on how to use the models, please refer to the documentation.
|
|
@ -0,0 +1,78 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file only contains a selection of the most common options. For a full
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
# -- Path setup --------------------------------------------------------------
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.abspath('../..'))
|
||||
autodoc_mock_imports = ["soundfile", "librosa"]
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'parakeet'
|
||||
copyright = '2020, parakeet-developers'
|
||||
author = 'parakeet-developers'
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = '0.2'
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.viewcode',
|
||||
"sphinx_rtd_theme",
|
||||
'sphinx.ext.mathjax',
|
||||
'numpydoc',
|
||||
'sphinx.ext.autosummary',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = []
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
source_suffix = ['.rst', '.md']
|
||||
|
||||
# -- Extension configuration -------------------------------------------------
|
||||
numpydoc_show_class_members = False
|
|
@ -0,0 +1,187 @@
|
|||
Audio Sample
|
||||
==================
|
||||
|
||||
TTS audio samples
|
||||
-------------------
|
||||
|
||||
Audio samples generated by a TTS system. Text is first transformed into spectrogram
|
||||
by a text-to-spectrogram model, then the spectrogram is converted into raw audio by
|
||||
a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<embed>
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> TransformerTTS + WaveFlow</th>
|
||||
<th align="left"> Tacotron2 + WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_5.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_6.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_7.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_8.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_waveflow_samples_0.2/sentence_9.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
</embed>
|
||||
|
||||
|
||||
Vocoder audio samples
|
||||
--------------------------
|
||||
|
||||
Audio samples generated from ground-truth spectrograms with a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<embed>
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> WaveFlow res 128</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
</embed>
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
==============================
|
||||
Design of Parakeet
|
||||
==============================
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
.. parakeet documentation master file, created by
|
||||
sphinx-quickstart on Thu Dec 17 20:01:34 2020.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Parakeet
|
||||
====================================
|
||||
|
||||
``parakeet`` is a deep learning based text-to-speech toolkit built upon ``paddlepaddle`` framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
|
||||
|
||||
``parakeet`` mainly consists of components below.
|
||||
|
||||
#. Implementation of models and commonly used neural network layers.
|
||||
#. Dataset abstraction and common data preprocessing pipelines.
|
||||
#. Ready-to-run experiments.
|
||||
|
||||
.. toctree::
|
||||
:caption: Getting started
|
||||
:maxdepth: 1
|
||||
|
||||
install
|
||||
basic
|
||||
advanced
|
||||
|
||||
.. toctree::
|
||||
:caption: Demos
|
||||
:maxdepth: 1
|
||||
|
||||
demo
|
||||
|
||||
.. toctree::
|
||||
:caption: Design of Parakeet
|
||||
:maxdepth: 1
|
||||
|
||||
design
|
||||
|
||||
.. toctree::
|
||||
:caption: Documentation
|
||||
:maxdepth: 1
|
||||
|
||||
parakeet.audio
|
||||
parakeet.data
|
||||
parakeet.datasets
|
||||
parakeet.frontend
|
||||
parakeet.modules
|
||||
parakeet.models
|
||||
parakeet.training
|
||||
parakeet.utils
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
|
@ -0,0 +1,83 @@
|
|||
=============
|
||||
Installation
|
||||
=============
|
||||
|
||||
|
||||
Install PaddlePaddle
|
||||
------------------------
|
||||
Parakeet requires PaddlePaddle as its backend. Note that 2.0.0rc1 or newer versions
|
||||
of paddle are required.
|
||||
|
||||
Since paddlepaddle has multiple packages depending on the device (cpu or gpu)
|
||||
and the dependency libraries, it is recommended to install a proper package of
|
||||
paddlepaddle with respect to the device and dependency library versions via
|
||||
pip.
|
||||
|
||||
Installing paddlepaddle with conda or building paddlepaddle from source is also
|
||||
supported. Please refer to `PaddlePaddle installation <https://www.paddlepaddle.org.cn/install/quick/)>`_ for more details.
|
||||
|
||||
Example instructions to install paddlepaddle via pip are listed below.
|
||||
|
||||
**PaddlePaddle with gpu**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python -m pip install paddlepaddle-gpu==2.0.0rc1.post101 -f https://paddlepaddle.org.cn/whl/stable.html
|
||||
python -m pip install paddlepaddle-gpu==2.0.0rc1.post100 -f https://paddlepaddle.org.cn/whl/stable.html
|
||||
|
||||
|
||||
**PaddlePaddle with cpu**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python -m pip install paddlepaddle==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple
|
||||
|
||||
|
||||
Install libsndfile
|
||||
-------------------
|
||||
|
||||
Experiments in parakeet often involve audio and spectrum processing, so
|
||||
``librosa`` and ``soundfile`` are required. ``soundfile`` requires an extra
|
||||
C library ``libsndfile``, which is not always handled by pip.
|
||||
|
||||
For Windows and Mac users, ``libsndfile`` is also installed when installing
|
||||
``soundfile`` via pip, but for Linux users, installing ``libsndfile`` via the
|
||||
system package manager is required. Example commands for popular distributions
|
||||
are listed below.
|
||||
|
||||
.. code-block::
|
||||
|
||||
# ubuntu, debian
|
||||
sudo apt-get install libsndfile1
|
||||
|
||||
# centos, fedora
|
||||
sudo yum install libsndfile
|
||||
|
||||
# openSUSE
|
||||
sudo zypper in libsndfile
|
||||
|
||||
For any problem with the installation of soundfile, please refer to
|
||||
`SoundFile <https://pypi.org/project/SoundFile>`_.
|
||||
|
||||
Install Parakeet
|
||||
------------------
|
||||
|
||||
There are two ways to install parakeet, depending on how you intend to use it.
|
||||
|
||||
#. If you want to run experiments provided by parakeet or add new models and
|
||||
experiments, it is recommended to clone the project from github
|
||||
(`Parakeet <https://github.com/PaddlePaddle/Parakeet>`_), and install it in
|
||||
editable mode.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
git clone https://github.com/PaddlePaddle/Parakeet
|
||||
cd Parakeet
|
||||
pip install -e .
|
||||
|
||||
#. If you only need to use the models provided by parakeet for inference, installing from
|
||||
pypi is recommended.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install paddle-parakeet
|
|
@ -0,0 +1,7 @@
|
|||
parakeet
|
||||
========
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
parakeet
|
|
@ -0,0 +1,29 @@
|
|||
parakeet.audio package
|
||||
======================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.audio.audio module
|
||||
---------------------------
|
||||
|
||||
.. automodule:: parakeet.audio.audio
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.audio.spec\_normalizer module
|
||||
--------------------------------------
|
||||
|
||||
.. automodule:: parakeet.audio.spec_normalizer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.audio
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,29 @@
|
|||
parakeet.data package
|
||||
=====================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.data.batch module
|
||||
--------------------------
|
||||
|
||||
.. automodule:: parakeet.data.batch
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.data.dataset module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: parakeet.data.dataset
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.data
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,29 @@
|
|||
parakeet.datasets package
|
||||
=========================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.datasets.common module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.datasets.common
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.datasets.ljspeech module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: parakeet.datasets.ljspeech
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.datasets
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,53 @@
|
|||
parakeet.frontend.normalizer package
|
||||
====================================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.frontend.normalizer.abbrrviation module
|
||||
------------------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer.abbrrviation
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.normalizer.acronyms module
|
||||
--------------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer.acronyms
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.normalizer.normalizer module
|
||||
----------------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer.normalizer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.normalizer.numbers module
|
||||
-------------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer.numbers
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.normalizer.width module
|
||||
-----------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer.width
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.frontend.normalizer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,45 @@
|
|||
parakeet.frontend package
|
||||
=========================
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
parakeet.frontend.normalizer
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.frontend.phonectic module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.phonectic
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.punctuation module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.punctuation
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.frontend.vocab module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: parakeet.frontend.vocab
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.frontend
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,38 @@
|
|||
parakeet.models package
|
||||
=======================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.models.tacotron2 module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: parakeet.models.tacotron2
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.models.transformer\_tts module
|
||||
---------------------------------------
|
||||
|
||||
.. automodule:: parakeet.models.transformer_tts
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.models.waveflow module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.models.waveflow
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.models
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,77 @@
|
|||
parakeet.modules package
|
||||
========================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.modules.attention module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.attention
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.audio module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.audio
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.conv module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.conv
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.geometry module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.geometry
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.losses module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.losses
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.masking module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.masking
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.positional\_encoding module
|
||||
--------------------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.positional_encoding
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.modules.transformer module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: parakeet.modules.transformer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.modules
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,25 @@
|
|||
parakeet package
|
||||
================
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
parakeet.audio
|
||||
parakeet.data
|
||||
parakeet.datasets
|
||||
parakeet.frontend
|
||||
parakeet.models
|
||||
parakeet.modules
|
||||
parakeet.training
|
||||
parakeet.utils
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,37 @@
|
|||
parakeet.training package
|
||||
=========================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.training.cli module
|
||||
----------------------------
|
||||
|
||||
.. automodule:: parakeet.training.cli
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.training.default\_config module
|
||||
----------------------------------------
|
||||
|
||||
.. automodule:: parakeet.training.default_config
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.training.experiment module
|
||||
-----------------------------------
|
||||
|
||||
.. automodule:: parakeet.training.experiment
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.training
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,61 @@
|
|||
parakeet.utils package
|
||||
======================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
parakeet.utils.checkpoint module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.checkpoint
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.utils.display module
|
||||
-----------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.display
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.utils.internals module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.internals
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.utils.layer\_tools module
|
||||
----------------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.layer_tools
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.utils.mp\_tools module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.mp_tools
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
parakeet.utils.scheduler module
|
||||
-------------------------------
|
||||
|
||||
.. automodule:: parakeet.utils.scheduler
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: parakeet.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
|
@ -0,0 +1,104 @@
|
|||
# 实验配置
|
||||
|
||||
本节主要讲述 parakeet 的推荐的配置实验的方式,以及我们做出这样的选择的原因。
|
||||
|
||||
## 配置选项的内容
|
||||
|
||||
深度学习实验常常有很多选项可配置。这些配置大概可以被分为几类:
|
||||
|
||||
1. 数据源以及数据处理方式配置;
|
||||
2. 实验结果保存路径配置;
|
||||
3. 数据预处理方式配置;
|
||||
4. 模型结构和超参数配置;
|
||||
5. 训练过程配置。
|
||||
|
||||
虽然这些配置之间也可能存在某些重叠项,比如数据预处理部分的配置可能就和模型配置有关。比如说 mel 频谱的维数,既可以理解为模型配置的一部分,也可以理解为数据处理配置的一部分。但大体上,配置文件是可以分成几个部分的。
|
||||
|
||||
## 常见配置文件格式
|
||||
|
||||
常见的配置文件的格式有 `ini`, `yaml`, `toml`, `json` 等。
|
||||
|
||||
`ini`
|
||||
优点:简单,支持字符串插值等操作。
|
||||
缺点:仅支持两层结构,值不带类型信息,解析的时候需要手动 cast。
|
||||
|
||||
`yaml`
|
||||
优点:格式简洁,值有类型,解析的时候一般不需手动 cast,支持写注释。
|
||||
缺点:语法规范复杂。
|
||||
|
||||
`toml`
|
||||
和 yaml 类似
|
||||
|
||||
`json`
|
||||
优点:格式简单,
|
||||
缺点:标记符号太多,可读性不佳,手写也容易出错。不支持注释。
|
||||
|
||||
出于语言本身的表达能力和可读性,我们选择 yaml, 但我们会尽可能使配置文件简单。
|
||||
|
||||
1. 类型上,只使用字符串,整数,浮点数,布尔值;
|
||||
2. 结构嵌套上,尽可能只使用两层或更浅的结构。
|
||||
|
||||
## 配置选项和命令行参数处理
|
||||
|
||||
对于深度学习实验,有部分配置是经常会发生改变的,比如数据源以及保存实验结果的路径,或者加载的 checkpoint 的路径等。对于这些配置,更好的做法是把它们实现为命令行参数。
|
||||
|
||||
其余的不经常发生变动的参数,推荐将其写在配置文件中,我们推荐使用 `yaml` 作为配置文件,因为它允许添加注释,并且更加人类可读。
|
||||
|
||||
当然把所有的选项都由 argparse 来处理也可以,但是对于选项丰富的深度学习实验来说,都使用 argparse 会导致代码异常冗长。
|
||||
|
||||
但是需要注意的是,同时使用配置文件和命令行解析工具的时候,如果不做特殊处理,配置文件所支持的选项并不能显示在 argparse.ArgumentParser 的 usage 和 help 信息里。这主要是配置文件解析和 argparse 在设计上的一些固有的差异导致的。
|
||||
|
||||
通过一些手段把配置所支持的选项附加到 ArgumentParser 固然可以弥补这点,但是这会存在一些默认值的优先级哪一方更高的问题,是默认配置的优先级更高,比如还是 ArgumentParser 中的默认值优先级更高。
|
||||
|
||||
因此我们选择不把配置所支持的选项附加到 ArgumentParser,而是分开处理两部分。
|
||||
|
||||
## 实践
|
||||
|
||||
我们选择 yacs 搭配 argparse 作为配置解析工具,为 argparse 命令行新增一个选项 `--config` 来传入配置文件。yacs 有几个特点:
|
||||
|
||||
1. 支持 yaml 格式的配置文件(亦即支持配置层级嵌套以及有类型的值);
|
||||
2. 支持 config 的增量覆盖,以及由命令行参数覆盖配置文件等灵活的操作;
|
||||
3. 支持 `.key` 递归访问属性,比字典式的 `["key"]` 方便;
|
||||
|
||||
我们推荐把默认的配置写成 python 代码(examples 中的每个例子都有一个 config.py,里面提供了默认的配置,并且带有注释)。而如果用户需要覆盖部分配置,则仅需要提供想要覆盖的部分配置即可,而不必提供一个完整的配置文件。这么做的考虑是:
|
||||
|
||||
1. 仅提供需要覆盖的选项也是许多软件配置的标准方式。
|
||||
2. 对于同一个模型的两次实验,往往仅仅只有很少的配置发生变化,仅提供增量的配置比提供完整的配置更容易让用户看出两次实验的配置差异。
|
||||
3. 运行脚本的时候可以不传 `--config` 参数,而以默认配置运行实验,简化运行脚本。
|
||||
|
||||
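下面是一个默认配置 `config.py` 的极简示意(其中的选项名仅作举例,并非固定的接口):

```python
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(dict(batch_size=32, valid_size=64))       # 数据相关的默认配置
_C.model = CN(dict(d_encoder=512, encoder_layers=4))   # 模型相关的默认配置
_C.training = CN(dict(lr=1e-4, max_iteration=500000))  # 训练相关的默认配置

def get_cfg_defaults():
    """返回默认配置的一个拷贝,调用方可以用 yaml 文件或命令行参数进一步覆盖它。"""
    return _C.clone()
```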
当新增实验的时候,可以参考 examples 里的例子来写默认配置文件。
|
||||
|
||||
除了可以通过 `--config` 命令行参数来指定用于覆盖的配置文件。另外,我们还可以通过新增一个 `--opts` 选项来接收 ArgumentParser 解析到的剩余命令行参数。这些参数将被用于进一步覆盖配置。使用方式是 `--opts key1 value1 key2 value2 ...`,即以空格分割键和值,比如`--opts training.lr 0.001 model.encoder_layers 4`。其中的键是配置中的键名,对于嵌套的选项,其键名以 `.` 连接。
|
||||
|
||||
## 默认的 ArgumentParser
|
||||
|
||||
我们提供了默认的 ArgumentParser(参考 `parakeet/training/cli.py`), 它实现了上述的功能。它包含极简的命令行选项,只有 `--config`, `--data`, `--output`, `--checkpoint_path`, `--device`, `--nprocs` 和 `--opts` 选项。
|
||||
|
||||
这是一个深度学习基本都需要的一些命令行选项,因此当新增实验的时候,可以直接使用这个 ArgumentParser,当有超出这个范围的命令行选项时,也可以再继续新增。
|
||||
|
||||
1. `--config` 和 `--opts` 用于支持配置文件解析,而配置文件本身处理了每个实验特有的选项;
|
||||
2. `--data` 和 `--output` 分别是数据集的路径和训练结果的保存路径(包含 checkpoints/ 文件夹,文本输出结果以及可视化输出结果);
|
||||
3. `--checkpoint_path` 用于在训练前加载某个 checkpoint, 当需要从某个特定的 checkpoint 加载继续训练。另外,在不传 `--checkpoint_path` 的情况下,如果 `--output` 下的 checkpoints/ 文件夹中包含了训练的结果,则默认会加载其中最新的 checkpoint 继续训练。
|
||||
4. `--device` 和 `--nprocs` 指定了运行方式,`--device` 指定运行设备类型,是在 cpu 还是 gpu 上运行。`--nprocs` 指的是用多少个进程训练,如果 `nprocs` > 1 则意味着使用多进程并行训练。(注:目前只支持 gpu 多卡多进程训练。)
|
||||
|
||||
使用帮助信息如下:
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
|
||||
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
|
||||
[--nprocs NPROCS] [--opts ...]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config FILE path of the config file to overwrite to default config
|
||||
with.
|
||||
--data DATA_DIR path to the datatset.
|
||||
--output OUTPUT_DIR path to save checkpoint and log. If not provided, a
|
||||
directory is created in runs/ to save outputs.
|
||||
--checkpoint_path CHECKPOINT_PATH
|
||||
path of the checkpoint to load
|
||||
--device {cpu,gpu} device type to use, cpu and gpu are supported.
|
||||
--nprocs NPROCS number of parallel processes to use.
|
||||
--opts ... options to overwrite --config file and the default
|
||||
config, passing in KEY VALUE pairs
|
||||
```
|
|
@ -0,0 +1,216 @@
|
|||
# 数据准备
|
||||
|
||||
本节主要讲述 `parakeet.data` 子模块的设计以及如何在实验中使用它。
|
||||
|
||||
`parakeet.data` 遵循 paddle 惯用的数据准备流程:Dataset, Sampler, batch function, DataLoader.
|
||||
|
||||
## Dataset
|
||||
|
||||
我们假设数据集是样例的列表。你可以通过 `__len__` 方法获取其长度,并且可以通过 `__getitem__` 方法随机访问其元素。有了上述两个方法,我们也可以用 `iter(dataset)` 来获得一个 dataset 的迭代器。我们一般通过继承 `paddle.io.Dataset` 来创建自己的数据集。为其实现 `__len__` 方法和 `__getitem__` 方法即可。
|
||||
|
||||
出于数据处理,数据加载和数据集大小等方面的考虑,可以采用集中策略来调控数据集是否被懒惰地预处理,是否被懒惰地被加载,是否常驻内存等。
|
||||
|
||||
1. 数据在数据集实例化的时候被全部预处理并常驻内存。对于数据预处理比较快,且整个数据集较小的情况,可以采用这样的策略。因为整个的数据集的预处理在数据集实例化时完成,因此要求预处理很快,否则将要花时间等待数据集实例化。因为被处理后的数据集常驻内存,因此要求数据集较小,否则可能不能将整个数据集加载进内存。
|
||||
2. 每个样例在被请求的时候预处理,并且把预处理的结果缓存。可以通过在数据集的 `__getitem__` 方法中调用单条样例的预处理方法来实现这个策略。这样做的条件一样是数据可以整个载入内存。但好处是不必花费很多时间等待数据集实例化。使用这个策略,则数据集被完整迭代一次之后,访问样例的时候会显著变快,因为不需要再次处理。但在首次使用的时候仍然会需要即时处理,所以如果快速评估数据迭代的数度还需要等数据集被迭代一遍。
|
||||
3. 先将数据集预处理一遍把结果保存下来。再作为另一个数据集使用,这个新的数据集的 `__getitem__` 方法则只是从存储器读取数据。一般来说数据读取的性能并不会制约模型的训练,并且这也不要求内存必须足以装下整个数据集。是一种较为灵活的方法。但是会需要一个单独的预处理脚本,并且根据处理后的数据写一个数据集。
|
||||
|
||||
以上的三种只是一种概念上的划分,实际使用时候我们可能混用以上的策略。举例如下:
|
||||
|
||||
1. 对于一个样例的多个字段,有的是很小的,比如说文本,可能可能常驻内存;而对于音频,频谱或者图像,可能预先处理并存储,在访问时仅加载处理好的结果。
|
||||
2. 对于某些比较大或者预处理比较慢的数据集。我们可以仅加载一个较小的元数据,里面包含了一些可以用于对样例进行排序或者筛选的特征码,则我们可以在不加载整个样例就可以利用这些元数据对数据进行排序或者筛选。
|
||||
|
||||
一般来说,我们将一个 Dataset 的子类看作是数据集和实验的具体需求之间的适配器。
|
||||
|
||||
parakeet 还提供了若干个高阶的 Dataset 类,用于从已有的 Dataset 产生新的 Dataset.
|
||||
|
||||
1. 用于字段组合的有 TupleDataset, DictDataset;
|
||||
2. 用于数据集切分合并的有 SliceDataset, SubsetDataset, ChainDataset;
|
||||
3. 用于缓存数据集的有 CacheDataset;
|
||||
4. 用于数据集筛选的有 FilterDataset;
|
||||
5. 用于变换数据集的有 TransformDataset.
|
||||
|
||||
可以灵活地使用这些高阶数据集来使数据处理更加灵活。
|
||||
|
||||
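下面是一个组合使用高阶数据集的简单示意(`raw_dataset` 和 `transform` 为假设已有的数据集和预处理函数,具体的构造参数请以 `parakeet.data` 的实现为准):

```python
from parakeet.data import TransformDataset, CacheDataset

# 逐样例做预处理,得到一个新的数据集
processed = TransformDataset(raw_dataset, transform)
# 缓存预处理的结果,避免重复计算(写法仅为示意)
cached = CacheDataset(processed)
```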
## DataLoader
|
||||
|
||||
`DataLoader` 类似 `Dataset` 也是可迭代对象,但是一般情况下,它是按批量来迭代的。在深度学习中我们需要 `DataLoader` 是因为把多个样例组成一个批次可以充分利用现代硬件的计算资源。可以根据一个 Dataset 构建一个 DataLoader,它可以被多次迭代。
|
||||
|
||||
构建 DataLoader 除了需要一个 Dataset 之外,还需要两个要素。
|
||||
|
||||
1. 如何组成批次。
|
||||
2. 如何选取样例来组成批次;
|
||||
|
||||
下面的两个小节将分别提供这两个要素。
|
||||
|
||||
### batch function
|
||||
|
||||
批次是包含多个样例的列表经过某种变换的结果。假设一个样例是一个拥有多个字段的结构(在不同的编程语言可能有不同的实现,比如在 python 中可以是 tuple, dict 等,在 C/C++ 中可能是一个 struct)。那么包含多个样例的列表就是一个结构的阵列(array of structure, AOS). 而出于训练神经网络的需要,我们希望一个批次和一个样例一样,是拥有多个字段的一个结构。因此需要一个方法,把一个结构的阵列(array of structures)变成一个阵列的结构(structure of arrays).
|
||||
|
||||
下面是一个简单的例子:
|
||||
|
||||
下面的表格代表了两个样例,每个包含 5 个字段。
|
||||
|
||||
| weight | height | width | depth | density |
|
||||
| ------ | ------ | ----- | ----- | ------- |
|
||||
| 1.2 | 1.1 | 1.3 | 1.4 | 0.8 |
|
||||
| 1.6 | 1.4 | 1.2 | 0.6 | 1.4 |
|
||||
|
||||
以上表格的 AOS 表示形式和 SOA 表示形式如下:
|
||||
|
||||
AOS:
|
||||
|
||||
```text
|
||||
[(1.2, 1.1, 1.3, 1.4, 0.8),
|
||||
|
||||
(1.6, 1.4, 1.2, 0.6, 1.4)]
|
||||
```
|
||||
|
||||
SOA:
|
||||
|
||||
```text
|
||||
([1.2, 1.6],
|
||||
[1.1, 1.4],
|
||||
[1.3, 1.2],
|
||||
[1.4, 0.6],
|
||||
[0.8, 1.4])
|
||||
```
|
||||
|
||||
对于上述的例子,将 AOS 转换为 SOA 是平凡的。只要把所有样例的各个字段 stack 起来就可以。但事情并非总是如此简单。当一个字段包含一个序列,你可能就需要先把所有的序列都补长 (pad) 到最长的序列长度,然后才能把它们 stack 起来。对于某些情形,批次可能比样例多一些字段,比如说对于包含序列的样例,在补长之后,可能需要增设一个字段来记录那些字段的有效长度。因此,一般情况下,需要一个函数来实现这个功能,而且这是和这个数据集搭配的。当然除了函数之外,也可以使用任何的可调用对象,我们把这些称为 batch function.
|
||||
|
||||
|
||||
### Sampler
|
||||
|
||||
有了 batch function(我们知道如何组成批次), 接下来是另一个问题,将什么组成批次呢?当组建一个批次的时候,我们需要决定选取那些样例来组成它。因此我们预设数据集是可以随机访问的,我们只需要选取对应的索引即可。我们使用 sampler 来完成选取 index 的任务。
|
||||
|
||||
Sampler 被实现为产生整数的可迭代对象。假设数据集有 `N` 个样例,那么产生 `[0, N)` 之间的整数的迭代器就是一个合适的迭代器。最常用的 sampler 是 `SequentialSampler` 和 `RandomSampler`.
|
||||
|
||||
当迭代一个 DataLoader 的时候,首先 sampler 产生多个 index, 然后根据这些 index 去取出对应的样例,并调用 batch function 把这些样例组成一个批次。当然取出样例的过程是可并行的,但调用 batch function 组成 batch 不是。
|
||||
|
||||
另外的一种选择是使用 batch sampler, 它是产生整数列表的可迭代对象。对于一般的 sampler, 需要对其迭代器使用 next 多次才能产出多个 index, 而对于 batch sampler, 对其迭代器使用 next 一次就可以产出多个 index. 对于使用一般的 sampler 的情形,batch size 由 DataLoader 的来决定。而对于 batch sampler, 则是由它决定了 DataLoader 的 batch size, 因此可以用它来实现一些特别的需求,比如说动态 batch size.
|
||||
|
||||
## 示例代码
|
||||
|
||||
以下是我们使用 `parakeet.data` 处理 `LJSpeech` 数据集的代码。
|
||||
|
||||
首先,我们定义一个 class 来代表 LJspeech 数据集,它只是如其所是地加载了元数据,亦即数据集中的 `metadata.csv` 文件,其中记录了音频文件的文件名,以及转录文本。但并不加载音频,也并不做任何的预处理。我们有意让这个数据集保持简单,它仅需要数据集的路径来实例化。
|
||||
|
||||
```python
|
||||
import csv
|
||||
import numpy as np
|
||||
import librosa
|
||||
from pathlib import Path
|
||||
from paddle.io import Dataset, DataLoader
|
||||
|
||||
from parakeet.data import dataset, batch_spec, batch_text_id
|
||||
|
||||
class LJSpeechMetaData(Dataset):
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
wav_dir = self.root / "wavs"
|
||||
csv_path = self.root / "metadata.csv"
|
||||
records = []
|
||||
speaker_name = "ljspeech"
|
||||
with open(str(csv_path), 'rt') as f:
|
||||
for line in f:
|
||||
filename, _, normalized_text = line.strip().split("|")
|
||||
filename = str(wav_dir / (filename + ".wav"))
|
||||
records.append([filename, normalized_text, speaker_name])
|
||||
self.records = records
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self.records[i]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
```
|
||||
|
||||
然后我们定义一个 `Transform` 类,用于处理 `LJSpeechMetaData` 中的样例,将其转换为模型所需要的数据。对于不同的模型可以定义不同的 Transform,这样就可以共用 `LJSpeechMetaData` 的代码。
|
||||
|
||||
```python
|
||||
from parakeet.audio import AudioProcessor
|
||||
from parakeet.audio import LogMagnitude
|
||||
from parakeet.frontend import English
|
||||
|
||||
class Transform(object):
|
||||
def __init__(self):
|
||||
self.frontend = English()
|
||||
self.processor = AudioProcessor(
|
||||
sample_rate=22050,
|
||||
n_fft=1024,
|
||||
win_length=1024,
|
||||
hop_length=256,
|
||||
f_max=8000)
|
||||
self.normalizer = LogMagnitude()
|
||||
|
||||
def __call__(self, record):
|
||||
fname, text, _ = record
|
||||
wav = self.processor.read_wav(fname)
|
||||
mel = self.processor.mel_spectrogram(wav)
|
||||
mel = self.normalizer.transform(mel)
|
||||
phonemes = self.frontend.phoneticize(text)
|
||||
ids = self.frontend.numericalize(phonemes)
|
||||
|
||||
stop_probs = np.ones([mel.shape[1]], dtype=np.int64)
|
||||
stop_probs[-1] = 2
|
||||
return (ids, mel, stop_probs)
|
||||
```
|
||||
|
||||
`Transform` 加载音频,并且提取频谱。把 `Transform` 实现为一个可调用的类可以方便地持有许多选项,比如和傅里叶变换相关的参数。这里可以把一个 `LJSpeechMetaData` 对象和一个 `Transform` 对象组合起来,创建一个 `TransformDataset`.
|
||||
|
||||
```python
|
||||
from parakeet.data import TransformDataset
|
||||
|
||||
meta = LJSpeechMetaData(data_path)
|
||||
transform = Transform()
|
||||
ljspeech = TransformDataset(meta, transform)
|
||||
```
|
||||
|
||||
当然也可以选择专门写一个转换脚本把转换后的数据集保存下来,然后再写一个适配的 Dataset 子类去加载这些保存的数据。实际这么做的效率会更高。
|
||||
|
||||
接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据,所以我们需要进行 padding.
|
||||
|
||||
```python
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
def __init__(self, padding_idx=0, padding_value=0.):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
|
||||
def __call__(self, examples):
|
||||
ids = [example[0] for example in examples]
|
||||
mels = [example[1] for example in examples]
|
||||
stop_probs = [example[2] for example in examples]
|
||||
|
||||
ids = batch_text_id(ids, pad_id=self.padding_idx)
|
||||
mels = batch_spec(mels, pad_value=self.padding_value)
|
||||
stop_probs = batch_text_id(stop_probs, pad_id=self.padding_idx)
|
||||
return ids, np.transpose(mels, [0, 2, 1]), stop_probs
|
||||
```
|
||||
|
||||
以上的组件准备就绪后,可以准备整个数据流。
|
||||
|
||||
```python
|
||||
def create_dataloader(source_path, valid_size, batch_size):
|
||||
lj = LJSpeechMetaData(source_path)
|
||||
transform = Transform()
|
||||
lj = TransformDataset(lj, transform)
|
||||
|
||||
valid_set, train_set = dataset.split(lj, valid_size)
|
||||
train_loader = DataLoader(
|
||||
train_set,
|
||||
return_list=False,
|
||||
batch_size=batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=LJSpeechCollector())
|
||||
valid_loader = DataLoader(
|
||||
valid_set,
|
||||
return_list=False,
|
||||
batch_size=batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=LJSpeechCollector())
|
||||
return train_loader, valid_loader
|
||||
```
|
||||
|
||||
train_loader 和 valid_loader 可以被迭代。对其迭代器使用 next, 返回的是 `paddle.Tensor` 的 list, 代表一个 batch,这些就可以直接用作 `paddle.nn.Layer` 的输入了。
|
|
@ -0,0 +1,75 @@
|
|||
# 实验流程
|
||||
|
||||
实验中有不少细节需要注意,比如模型的保存和加载,定期进行验证,文本 log 和 可视化 log,保存配置文件等,另外对于不同的运行方式还有额外的处理,这些代码可能比较繁琐,但是对于追踪代码变化对结果的影响以及 debug 都非常重要。为了减少写这部分代码的成本,我们提供了不少通用的辅助代码,比如用于保存和加载,以及可视化的代码,可供实验代码直接使用。
|
||||
|
||||
而对于整个实验过程,我们提供了一个 ExperimentBase 类,它是在模型和实验开发的过程抽象出来的训练过程模板,可以作为具体实验的基类使用。相比 chainer 中的 Trainer 以及 keras 中的 Model.fit 而言,ExperimentBase 是一个相对低层级的 API。它是作为基类来使用,用户仍然需要实现整个训练过程,也因此可以自由控制许多东西;而不是作为一种组合方式来使用,用户只需要提供模型,数据集,评价指标等就能自动完成整个训练过程。
|
||||
|
||||
前者的方式并不能节省很多代码量,只是以一种标准化的方式来组织代码。后者的方式虽然能够节省许多代码量,但是把如何组成整个训练过程的方式对用户隐藏了。如果需要为标准的训练过程添加一些自定义行为,则必须通过 extension/hook 等方式来实现,在一些固定的时点加入一些自定义行为(比如 iteration 开始、结束时,epoch 开始、结束时,整个训练流程开始、结束时)。
|
||||
|
||||
通过 extension/hook 之类的方式来为训练流程加入自定义行为,往往存在一些 access 的限制。extension/hook 一般是通过 callable 的形式来实现,但是这个 callable 可访问的变量往往是有限的,比如说只能访问 model, optimzier, dataloader, iteration, epoch, metric 等,如果需要访问其他的中间变量,则往往比较麻烦。
|
||||
|
||||
此外,组合式的使用方式往往对几个组件之间传输数据的协议有一些预设。一个常见的预设是:dataloader 产生的 batch 即是 model 的输入。在简单的情况下,这样大抵是没有问题的,但是也存在一些可能,模型需要除了 batch 之外的输入。令一个常见的预设是:criterion 仅需要 model 的 input 和 output 就能计算 loss, 但这么做其实存在 overkill 的可能,某些情况下,不需要 input 和 output 的全部字段就能计算 loss,如果为了满足协议而把 criterion 的接口设计成一样的,存在输出不必要的参数的问题。
|
||||
|
||||
## ExperimentBase 的设计
|
||||
|
||||
因此我们选择了低层次的接口,用户仍然可以自由操作训练过程,而只是对训练过程做了粗粒度的抽象。可以参考 [ExperimentBase](parakeet/training/experiment.py) 的代码。
|
||||
|
||||
继承 ExperimentBase 编写自己的实验类的时候,需要遵循以下的一些规范:
|
||||
|
||||
1. 包含 `.model`, `.optimizer`, `.train_loader`, `.valid_loader`, `.config`, `.args` 等属性。
|
||||
2. 配置需要包含一个 `.training` 字段, 其中包含 `valid_interval`, `save_interval` 和 `max_iteration` 几个键. 它们被用作触发验证,保存 checkpoint 以及停止训练的条件。
|
||||
3. 需要实现四个方法 `train_batch`, `valid`, `setup_model` and `setup_dataloader`。`train_batch` 是在一个 batch 的过程,`valid` 是在整个验证数据集上执行一次验证的过程,`setup_model` 是初始化 model 和 optimizer 的过程,其他的模型构建相关的代码也可以放在这里,`setup_dataloader` 是 train_loader 和 valid_loader 的构建过程。
|
||||
|
||||
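下面是一个继承 ExperimentBase 的最小示意(其中 `MyModel` 等名字仅作占位,方法体省略):

```python
import paddle
from parakeet.training.experiment import ExperimentBase

class MyExperiment(ExperimentBase):
    def setup_model(self):
        # 创建 model 和 optimizer
        model = MyModel(self.config.model)  # 假想的模型类,仅作占位
        optimizer = paddle.optimizer.Adam(
            learning_rate=self.config.training.lr,
            parameters=model.parameters())
        self.model = model
        self.optimizer = optimizer

    def setup_dataloader(self):
        ...  # 构建 self.train_loader 和 self.valid_loader

    def train_batch(self):
        ...  # 处理一个 batch:forward/backward 计算与参数更新

    def valid(self):
        ...  # 在整个验证数据集上执行一次评估
```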
实验的初始化过程如下, 包含了创建模型,优化器,数据迭代器,准备输出目录,logger 和可视化,保存配置的工作,除了 `setup_dataloader` 和 `self.setup_model` 需要自行实现,其他的几个方法都已有标准的实现。
|
||||
|
||||
```python
|
||||
def __init__(self, config, args):
|
||||
self.config = config
|
||||
self.args = args
|
||||
|
||||
def setup(self):
|
||||
paddle.set_device(self.args.device)
|
||||
if self.parallel:
|
||||
self.init_parallel()
|
||||
|
||||
self.setup_output_dir()
|
||||
self.dump_config()
|
||||
self.setup_visualizer()
|
||||
self.setup_logger()
|
||||
self.setup_checkpointer()
|
||||
|
||||
self.setup_dataloader()
|
||||
self.setup_model()
|
||||
|
||||
self.iteration = 0
|
||||
self.epoch = 0
|
||||
```
|
||||
|
||||
使用的时候只需要以下的代码即可配置好一次实验:
|
||||
|
||||
```python
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
```
|
||||
|
||||
整个训练流程可以表示如下:
|
||||
|
||||
```python
|
||||
def train(self):
|
||||
self.new_epoch()
|
||||
while self.iteration < self.config.training.max_iteration:
|
||||
self.iteration += 1
|
||||
self.train_batch()
|
||||
|
||||
if self.iteration % self.config.training.valid_interval == 0:
|
||||
self.valid()
|
||||
|
||||
if self.iteration % self.config.training.save_interval == 0:
|
||||
self.save()
|
||||
```
|
||||
|
||||
使用时只需要执行如下代码即可开始实验。
|
||||
|
||||
```python
|
||||
exp.run()
|
||||
```
|
|
@ -0,0 +1,74 @@
|
|||
# 如何准备自己的实验
|
||||
|
||||
对于一般的深度学习实验,有几个部分需要处理。
|
||||
|
||||
1. 按照模型的需要对数据进行预处理,并且按批次迭代数据集;
|
||||
2. 定义模型以及优化器等组件;
|
||||
3. 写出训练过程(一般包括 forward/backward 计算,参数更新,log 记录,可视化,定期评估等步骤);
|
||||
4. 配置并运行实验。
|
||||
|
||||
## 数据处理
|
||||
|
||||
对于数据处理,`parakeet.data` 采用了 paddlepaddle 常用的 `Dataset -> DataLoader` 的流程。数据处理流程的概览如下:
|
||||
|
||||
```text
|
||||
Dataset --(transform)--> Dataset --+
|
||||
sampler --+
|
||||
batch_fn --+-> DataLoader
|
||||
```
|
||||
|
||||
其中 transform 代表的是对样例的预处理。可以使用 `parakeet.data` 中的 TransformDataset 来从一个 Dataset 构建另一个 Dataset.
|
||||
|
||||
得到想要的 Dataset 之后,提供 sampler 和 batch function, 即可据此构建 DataLoader. DataLoader 产生的结果可以直接用作模型的输入。
|
||||
|
||||
详细的使用方式参见 [data_cn](./data_cn.md).
|
||||
|
||||
## 模型
|
||||
|
||||
为了对模型的可复用性和功能做较好的平衡,我们把模型按照其特征分为几种。
|
||||
|
||||
对于较为常用,可以作为其他更大的模型的部分的模块,我们尽可能将其实现得足够简单和通用,因为它们会被复用。对于含有可训练参数的模块,一般实现为 `paddle.nn.Layer` 的子类,但它们不是直接面向一个任务,因此不会带上处理未加工的输入和输出的功能。对于不含有可训练参数的模块,可以直接实现为一个函数,其输入输出都是 `paddle.Tensor` 或其集合。
|
||||
|
||||
针对一个特定任务的开箱模型,一般实现为 `paddle.nn.Layer` 的子类,是一个任务的核心计算单元。为了方便地处理输入和输出,一般还可以为它添加处理未加工的输入输出的功能。比如对于 NLP 任务来说,尽管神经网络接受的输出是文本的 id, 但是为了使模型能够处理未加工的输入,文本预处理的功能,以及文本转 id 的字典,也都应该视作模型的一部分。
|
||||
|
||||
当一个模型足够复杂,对其进行模块化切分是更好的选择,尽管拆分出来的小模块的功能也不一定非常通用,可能只是用于某个模型,但是当作么做有利于代码的清晰简洁时,仍然推荐这么做。
|
||||
|
||||
在 parakeet 的目录结构中,复用性较高的模块被放在 [parakeet.modules](../parakeet/modules/), 但是针对特定任务的模型则放在 [parakeet.models](../parakeet/models).
|
||||
|
||||
当开发新的模型的时候,开发者需要考虑拆分模块的可行性,以及模块的通用程度,把它们分置于合适的目录。
|
||||
|
||||
## 配置实验
|
||||
|
||||
我们使用 yacs 和 argparse 分别处理配置文件解析和命令行参数解析。关于配置的推荐方式,参考 [实验配置](./config_cn.md).
|
||||
|
||||
## 训练流程
|
||||
|
||||
训练流程一般就是多次训练一个循环体。典型的循环体包含如下的过程:
|
||||
|
||||
1. 迭代数据集;
|
||||
2. 处理批次数据;
|
||||
3. 神经网络的 forward/backward 计算;
|
||||
4. 参数更新;
|
||||
5. 符合一定条件时,在验证数据集上评估模型;
|
||||
6. 写日志,可视化,以及在某些情况下保存必要的中间结果;
|
||||
7. 保存模型和优化器的状态。
|
||||
|
||||
`数据处理` 包含了数据集以及 batch_function 的定义, 模型和优化器包含了模型的 forward/backward 计算的定义。而在模型和数据都准备好了,我们需要把这些组织起来,完成实验代码。
|
||||
|
||||
训练流程的组装,可以参考 [实验流程](./experiment_cn.md).
|
||||
|
||||
## 实验模板
|
||||
|
||||
实验代码一般以如下的方式组织:
|
||||
|
||||
```text
|
||||
├── README.md (实验的帮助信息)
|
||||
├── config.py (默认配置)
|
||||
├── preprocess.py (数据预处理脚本)
|
||||
├── data.py (Dataset, batch_function 等的定义)
|
||||
├── synthesis.py (用于生成的代码)
|
||||
├── train.py (用于训练的代码)
|
||||
└── utils.py (其他必要的辅助函数)
|
||||
```
|
||||
|
||||
在这个软件源中包含了几个例子,可以在 [Parakeet/examples](../examples) 中查看。这些实验被作为样例提供给用户,可以直接运行。同时也欢迎用户添加新的模型和实验并为 `Parakeet` 贡献代码。
|
|
@ -0,0 +1,63 @@
|
|||
=============
|
||||
安装
|
||||
=============
|
||||
|
||||
|
||||
安装 PaddlePaddle
|
||||
-------------------
|
||||
Parakeet 以 PaddlePaddle 作为其后端,因此依赖 PaddlePaddle,值得说明的是 Parakeet 要求 2.0 及以上版本的 PaddlePaddle。你可以通过 pip 安装。如果需要安装支持 gpu 版本的 PaddlePaddle,需要根据环境中的 cuda 和 cudnn 的版本来选择 wheel 包的版本。使用 conda 安装以及源码编译安装的方式请参考 `PaddlePaddle 快速安装 <https://www.paddlepaddle.org.cn/install/quick/)>`_.
|
||||
|
||||
**gpu 版 PaddlePaddle**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python -m pip install paddlepaddle-gpu==2.0.0rc1.post101 -f https://paddlepaddle.org.cn/whl/stable.html
|
||||
python -m pip install paddlepaddle-gpu==2.0.0rc1.post100 -f https://paddlepaddle.org.cn/whl/stable.html
|
||||
|
||||
|
||||
**cpu 版 PaddlePaddle**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python -m pip install paddlepaddle==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple
|
||||
|
||||
|
||||
安装 libsndfile
|
||||
-------------------
|
||||
|
||||
因为 Parakeet 的实验中常常会需要用到和音频处理,以及频谱处理相关的功能,所以我们依赖 librosa 和 soundfile 进行音频处理。而 librosa 和 soundfile 依赖一个 C 的库 libsndfile, 因为这不是 python 的包,对于 windows 用户和 mac 用户,使用 pip 安装 soundfile 的时候,libsndfile 也会被安装。如果遇到问题也可以参考 `SoundFile <https://pypi.org/project/SoundFile>`_.
|
||||
|
||||
对于 linux 用户,需要使用系统的包管理器安装这个包,常见发行版上的命令参考如下。
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
# ubuntu, debian
|
||||
sudo apt-get install libsndfile1
|
||||
|
||||
# centos, fedora,
|
||||
sudo yum install libsndfile
|
||||
|
||||
# openSUSE
|
||||
sudo zypper in libsndfile
|
||||
|
||||
|
||||
安装 Parakeet
|
||||
------------------
|
||||
|
||||
我们提供两种方式来使用 Parakeet.
|
||||
|
||||
#. 需要运行 Parakeet 自带的实验代码,或者希望进行二次开发的用户,可以先从 github 克隆本工程,cd 仅工程目录,并进行可编辑式安装(不会被复制到 site-packages, 而且对工程的修改会立即生效,不需要重新安装),之后就可以使用了。
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# -e 表示可编辑式安装
|
||||
pip install -e .
|
||||
|
||||
|
||||
#. 仅需要使用我们提供的训练好的模型进行预测,那么也可以直接安装 pypi 上的 wheel 包的版本。
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install paddle-parakeet
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
# Parakeet 概览
|
||||
|
||||
<img src="../images/logo.png" alt="parakeet-logo" style="zoom: 33%;" />
|
||||
|
||||
Parakeet 旨在为开源社区提供一个灵活,高效,先进的语音合成工具箱。Parakeet 基于PaddlePaddle 2.0 构建,并且包含了百度研究院以及其他研究机构的许多有影响力的 TTS 模型。
|
||||
|
||||
Parakeet 为用户和开发者提供了
|
||||
|
||||
1. 可复用的模型以及常用的模块;
|
||||
2. 从数据处理,模型训练到预测等一系列过程的完整实验;
|
||||
3. 高质量的开箱即用模型。
|
|
@ -1,148 +0,0 @@
|
|||
# Clarinet
|
||||
|
||||
PaddlePaddle dynamic graph implementation of ClariNet, a convolutional network based vocoder. The implementation is based on the paper [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281).
|
||||
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── data.py data_processing
|
||||
├── configs/ (example) configuration file
|
||||
├── synthesis.py script to synthesize waveform from mel_spectrogram
|
||||
├── train.py script to train a model
|
||||
└── utils.py utility functions
|
||||
```
|
||||
|
||||
## Saving & Loading
|
||||
`train.py` and `synthesis.py` have 3 arguments in common: `--checkpoint`, `--iteration` and `output`.
|
||||
|
||||
1. `output` is the directory for saving results.
|
||||
During training, checkpoints are saved in `checkpoints/` in `output`, and the tensorboard log is saved in `log/` in `output`. Other possible outputs are saved in `states/` in `output`.
|
||||
During synthesizing, audio files and other possible outputs are saved in `synthesis/` in `output`.
|
||||
So after training and synthesizing with the same output directory, the file structure of the output directory looks like this.
|
||||
|
||||
```text
|
||||
├── checkpoints/ # checkpoint directory (including *.pdparams, *.pdopt and a text file `checkpoint` that records the latest checkpoint)
|
||||
├── states/ # audio files generated at validation and other possible outputs
|
||||
├── log/ # tensorboard log
|
||||
└── synthesis/ # synthesized audio files and other possible outputs
|
||||
```
|
||||
|
||||
2. `--checkpoint` and `--iteration` are used to load from an existing checkpoint. Loading an existing checkpoint follows these rules:
|
||||
If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
|
||||
If `--checkpoint` is not provided, we try to load the checkpoint specified by `--iteration` from the checkpoint directory. If `--iteration` is not provided either, we try to load the latest checkpoint from the checkpoint directory.
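For reference, the sketch below condenses how `synthesis.py` and `train.py` in this example apply the rule above through `parakeet.utils.io.load_parameters`; treat it as an illustration of the loading logic rather than a separate API.

```python
import os
from parakeet.utils import io

def restore(model, args):
    if args.checkpoint is not None:
        # an explicit --checkpoint path takes precedence
        iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
    else:
        # otherwise look in <output>/checkpoints: --iteration selects a specific
        # step, and None means "load the latest recorded checkpoint"
        checkpoint_dir = os.path.join(args.output, "checkpoints")
        iteration = io.load_parameters(
            model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
    return iteration
```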
|
||||
|
||||
## Train
|
||||
|
||||
Train the model using `train.py`; follow the usage displayed by `python train.py --help`.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--device DEVICE] [--data DATA]
|
||||
[--checkpoint CHECKPOINT | --iteration ITERATION]
|
||||
[--wavenet WAVENET]
|
||||
output
|
||||
|
||||
Train a ClariNet model with LJspeech and a trained WaveNet model.
|
||||
|
||||
positional arguments:
|
||||
output path to save experiment results
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG path of the config file
|
||||
--device DEVICE device to use
|
||||
--data DATA path of LJspeech dataset
|
||||
--checkpoint CHECKPOINT checkpoint to resume from
|
||||
--iteration ITERATION the iteration of the checkpoint to load from output directory
|
||||
--wavenet WAVENET wavenet checkpoint to use
```
|
||||
|
||||
- `--config` is the configuration file to use. The provided configurations can be used directly, and you can also change some values in the configuration file to train the model with a different config.
|
||||
- `--device` is the device (gpu id) to use for training. `-1` means CPU.
|
||||
- `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains `metadata.txt`).
|
||||
|
||||
- `--checkpoint` is the path of the checkpoint.
|
||||
- `--iteration` is the iteration of the checkpoint to load from output directory.
|
||||
- `output` is the directory to save results; all results are saved in this directory.
|
||||
|
||||
See [Saving-&-Loading](#Saving-&-Loading) for details of checkpoint loading.
|
||||
|
||||
- `--wavenet` is the path of the wavenet checkpoint to load.
|
||||
When you start training a ClariNet model without loading from a ClariNet checkpoint, you should have trained a WaveNet model with a single Gaussian output distribution. Make sure the config of the teacher matches that of the trained WaveNet model.
|
||||
|
||||
Example script:
|
||||
|
||||
```bash
|
||||
python train.py \
|
||||
--config=./configs/clarinet_ljspeech.yaml \
|
||||
--data=./LJSpeech-1.1/ \
|
||||
--device=0 \
|
||||
--wavenet="wavenet-step-2000000" \
|
||||
experiment
|
||||
```
|
||||
|
||||
You can monitor the training log via tensorboard, using the commands below.
|
||||
|
||||
```bash
|
||||
cd experiment/log
|
||||
tensorboard --logdir=.
|
||||
```
|
||||
|
||||
## Synthesis
|
||||
```text
|
||||
usage: synthesis.py [-h] [--config CONFIG] [--device DEVICE] [--data DATA]
|
||||
[--checkpoint CHECKPOINT | --iteration ITERATION]
|
||||
output
|
||||
|
||||
Synthesize audio files from mel spectrogram in the validation set.
|
||||
|
||||
positional arguments:
|
||||
output path to save the synthesized audio
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG path of the config file
|
||||
--device DEVICE device to use.
|
||||
--data DATA path of LJspeech dataset
|
||||
--checkpoint CHECKPOINT checkpoint to resume from
|
||||
--iteration ITERATION the iteration of the checkpoint to load from output directory
|
||||
```
|
||||
|
||||
- `--config` is the configuration file to use. You should use the same configuration with which you trained your model.
|
||||
- `--device` is the device (gpu id) to use for synthesis. `-1` means CPU.
|
||||
- `--data` is the path of the LJspeech dataset. In principle, a dataset is not needed for synthesis, but since the input is a mel spectrogram, we need to compute mel spectrograms from the audio files in the validation set.
|
||||
- `--checkpoint` is the checkpoint to load.
|
||||
- `--iteration` is the iteration of the checkpoint to load from output directory.
|
||||
- `output` is the directory to save synthesized audio. Audio files are saved in `synthesis/` in the `output` directory.
|
||||
See [Saving-&-Loading](#Saving-&-Loading) for details of checkpoint loading.
|
||||
|
||||
|
||||
Example script:
|
||||
|
||||
```bash
|
||||
python synthesis.py \
|
||||
--config=./configs/clarinet_ljspeech.yaml \
|
||||
--data=./LJSpeech-1.1/ \
|
||||
--device=0 \
|
||||
--iteration=500000 \
|
||||
experiment
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
python synthesis.py \
|
||||
--config=./configs/clarinet_ljspeech.yaml \
|
||||
--data=./LJSpeech-1.1/ \
|
||||
--device=0 \
|
||||
--checkpoint="experiment/checkpoints/step-500000" \
|
||||
experiment
|
||||
```
|
|
@ -1,52 +0,0 @@
|
|||
data:
|
||||
batch_size: 8
|
||||
train_clip_seconds: 0.5
|
||||
sample_rate: 22050
|
||||
hop_length: 256
|
||||
win_length: 1024
|
||||
n_fft: 2048
|
||||
|
||||
n_mels: 80
|
||||
valid_size: 16
|
||||
|
||||
|
||||
conditioner:
|
||||
upsampling_factors: [16, 16]
|
||||
|
||||
teacher:
|
||||
n_loop: 10
|
||||
n_layer: 3
|
||||
filter_size: 2
|
||||
residual_channels: 128
|
||||
loss_type: "mog"
|
||||
output_dim: 3
|
||||
log_scale_min: -9
|
||||
|
||||
student:
|
||||
n_loops: [10, 10, 10, 10, 10, 10]
|
||||
n_layers: [1, 1, 1, 1, 1, 1]
|
||||
filter_size: 3
|
||||
residual_channels: 64
|
||||
log_scale_min: -7
|
||||
|
||||
stft:
|
||||
n_fft: 2048
|
||||
win_length: 1024
|
||||
hop_length: 256
|
||||
|
||||
loss:
|
||||
lmd: 4
|
||||
|
||||
train:
|
||||
learning_rate: 0.0005
|
||||
anneal_rate: 0.5
|
||||
anneal_interval: 200000
|
||||
gradient_max_norm: 100.0
|
||||
|
||||
checkpoint_interval: 1000
|
||||
eval_interval: 1000
|
||||
|
||||
max_iterations: 2000000
|
||||
|
||||
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
data:
|
||||
batch_size: 8
|
||||
train_clip_seconds: 0.5
|
||||
sample_rate: 22050
|
||||
hop_length: 256
|
||||
win_length: 1024
|
||||
n_fft: 2048
|
||||
|
||||
n_mels: 80
|
||||
valid_size: 16
|
||||
|
||||
|
||||
conditioner:
|
||||
upsampling_factors: [16, 16]
|
||||
|
||||
teacher:
|
||||
n_loop: 10
|
||||
n_layer: 3
|
||||
filter_size: 2
|
||||
residual_channels: 128
|
||||
loss_type: "mog"
|
||||
output_dim: 3
|
||||
log_scale_min: -9
|
||||
|
||||
student:
|
||||
n_loops: [10, 10, 10, 10, 10, 10]
|
||||
n_layers: [1, 1, 1, 1, 1, 1]
|
||||
filter_size: 3
|
||||
residual_channels: 64
|
||||
log_scale_min: -7
|
||||
|
||||
stft:
|
||||
n_fft: 2048
|
||||
win_length: 1024
|
||||
hop_length: 256
|
||||
|
||||
loss:
|
||||
lmd: 4
|
||||
|
||||
train:
|
||||
learning_rate: 0.0005
|
||||
anneal_rate: 0.5
|
||||
anneal_interval: 200000
|
||||
gradient_max_norm: 100.0
|
||||
|
||||
checkpoint_interval: 1000
|
||||
eval_interval: 1000
|
||||
|
||||
max_iterations: 2000000
|
||||
|
||||
|
||||
|
|
@ -1,179 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import ruamel.yaml
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
import pickle
|
||||
import numpy as np
|
||||
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
fluid.require_version('1.8.0')
|
||||
|
||||
from parakeet.modules.weight_norm import WeightNormWrapper
|
||||
from parakeet.models.wavenet import WaveNet, UpsampleNet
|
||||
from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
|
||||
from parakeet.data import TransformDataset, SliceDataset, RandomSampler, SequentialSampler, DataCargo
|
||||
from parakeet.utils.layer_tools import summary, freeze
|
||||
from parakeet.utils import io
|
||||
|
||||
from utils import eval_model
|
||||
sys.path.append("../wavenet")
|
||||
from data import LJSpeechMetaData, Transform, DataCollector
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Synthesize audio files from mel spectrogram in the validation set."
|
||||
)
|
||||
parser.add_argument("--config", type=str, help="path of the config file")
|
||||
parser.add_argument(
|
||||
"--device", type=int, default=-1, help="device to use.")
|
||||
parser.add_argument("--data", type=str, help="path of LJspeech dataset")
|
||||
|
||||
g = parser.add_mutually_exclusive_group()
|
||||
g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
|
||||
g.add_argument(
|
||||
"--iteration",
|
||||
type=int,
|
||||
help="the iteration of the checkpoint to load from output directory")
|
||||
|
||||
parser.add_argument(
|
||||
"output",
|
||||
type=str,
|
||||
default="experiment",
|
||||
help="path to save the synthesized audio")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.config, 'rt') as f:
|
||||
config = ruamel.yaml.safe_load(f)
|
||||
|
||||
if args.device == -1:
|
||||
place = fluid.CPUPlace()
|
||||
else:
|
||||
place = fluid.CUDAPlace(args.device)
|
||||
|
||||
dg.enable_dygraph(place)
|
||||
|
||||
ljspeech_meta = LJSpeechMetaData(args.data)
|
||||
|
||||
data_config = config["data"]
|
||||
sample_rate = data_config["sample_rate"]
|
||||
n_fft = data_config["n_fft"]
|
||||
win_length = data_config["win_length"]
|
||||
hop_length = data_config["hop_length"]
|
||||
n_mels = data_config["n_mels"]
|
||||
train_clip_seconds = data_config["train_clip_seconds"]
|
||||
transform = Transform(sample_rate, n_fft, win_length, hop_length, n_mels)
|
||||
ljspeech = TransformDataset(ljspeech_meta, transform)
|
||||
|
||||
valid_size = data_config["valid_size"]
|
||||
ljspeech_valid = SliceDataset(ljspeech, 0, valid_size)
|
||||
ljspeech_train = SliceDataset(ljspeech, valid_size, len(ljspeech))
|
||||
|
||||
teacher_config = config["teacher"]
|
||||
n_loop = teacher_config["n_loop"]
|
||||
n_layer = teacher_config["n_layer"]
|
||||
filter_size = teacher_config["filter_size"]
|
||||
context_size = 1 + n_layer * sum([filter_size**i for i in range(n_loop)])
|
||||
print("context size is {} samples".format(context_size))
|
||||
train_batch_fn = DataCollector(context_size, sample_rate, hop_length,
|
||||
train_clip_seconds)
|
||||
valid_batch_fn = DataCollector(
|
||||
context_size, sample_rate, hop_length, train_clip_seconds, valid=True)
|
||||
|
||||
batch_size = data_config["batch_size"]
|
||||
train_cargo = DataCargo(
|
||||
ljspeech_train,
|
||||
train_batch_fn,
|
||||
batch_size,
|
||||
sampler=RandomSampler(ljspeech_train))
|
||||
|
||||
# only batch_size=1 is enabled for validation
|
||||
valid_cargo = DataCargo(
|
||||
ljspeech_valid,
|
||||
valid_batch_fn,
|
||||
batch_size=1,
|
||||
sampler=SequentialSampler(ljspeech_valid))
|
||||
|
||||
# conditioner(upsampling net)
|
||||
conditioner_config = config["conditioner"]
|
||||
upsampling_factors = conditioner_config["upsampling_factors"]
|
||||
upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
|
||||
freeze(upsample_net)
|
||||
|
||||
residual_channels = teacher_config["residual_channels"]
|
||||
loss_type = teacher_config["loss_type"]
|
||||
output_dim = teacher_config["output_dim"]
|
||||
log_scale_min = teacher_config["log_scale_min"]
|
||||
assert loss_type == "mog" and output_dim == 3, \
|
||||
"the teacher wavenet should be a wavenet with single gaussian output"
|
||||
|
||||
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
|
||||
filter_size, loss_type, log_scale_min)
|
||||
# load & freeze upsample_net & teacher
|
||||
freeze(teacher)
|
||||
|
||||
student_config = config["student"]
|
||||
n_loops = student_config["n_loops"]
|
||||
n_layers = student_config["n_layers"]
|
||||
student_residual_channels = student_config["residual_channels"]
|
||||
student_filter_size = student_config["filter_size"]
|
||||
student_log_scale_min = student_config["log_scale_min"]
|
||||
student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
|
||||
n_mels, student_filter_size)
|
||||
|
||||
stft_config = config["stft"]
|
||||
stft = STFT(
|
||||
n_fft=stft_config["n_fft"],
|
||||
hop_length=stft_config["hop_length"],
|
||||
win_length=stft_config["win_length"])
|
||||
|
||||
lmd = config["loss"]["lmd"]
|
||||
model = Clarinet(upsample_net, teacher, student, stft,
|
||||
student_log_scale_min, lmd)
|
||||
summary(model)
|
||||
|
||||
# load parameters
|
||||
if args.checkpoint is not None:
|
||||
# load from args.checkpoint
|
||||
iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
|
||||
else:
|
||||
# load from "args.output/checkpoints"
|
||||
checkpoint_dir = os.path.join(args.output, "checkpoints")
|
||||
iteration = io.load_parameters(
|
||||
model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
|
||||
assert iteration > 0, "A trained checkpoint is needed."
|
||||
|
||||
# make generation fast
|
||||
for sublayer in model.sublayers():
|
||||
if isinstance(sublayer, WeightNormWrapper):
|
||||
sublayer.remove_weight_norm()
|
||||
|
||||
# data loader
|
||||
valid_loader = fluid.io.DataLoader.from_generator(
|
||||
capacity=10, return_list=True)
|
||||
valid_loader.set_batch_generator(valid_cargo, place)
|
||||
|
||||
# the directory to save audio files
|
||||
synthesis_dir = os.path.join(args.output, "synthesis")
|
||||
if not os.path.exists(synthesis_dir):
|
||||
os.makedirs(synthesis_dir)
|
||||
|
||||
eval_model(model, valid_loader, synthesis_dir, iteration, sample_rate)
|
|
@ -1,243 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import ruamel.yaml
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
import pickle
|
||||
import numpy as np
|
||||
from visualdl import LogWriter
|
||||
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
fluid.require_version('1.8.0')
|
||||
|
||||
from parakeet.models.wavenet import WaveNet, UpsampleNet
|
||||
from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
|
||||
from parakeet.data import TransformDataset, SliceDataset, CacheDataset, RandomSampler, SequentialSampler, DataCargo
|
||||
from parakeet.utils.layer_tools import summary, freeze
|
||||
from parakeet.utils import io
|
||||
|
||||
from utils import make_output_tree, eval_model, load_wavenet
|
||||
|
||||
# import dataset from wavenet
|
||||
sys.path.append("../wavenet")
|
||||
from data import LJSpeechMetaData, Transform, DataCollector
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Train a ClariNet model with LJspeech and a trained WaveNet model."
|
||||
)
|
||||
parser.add_argument("--config", type=str, help="path of the config file")
|
||||
parser.add_argument("--device", type=int, default=-1, help="device to use")
|
||||
parser.add_argument("--data", type=str, help="path of LJspeech dataset")
|
||||
|
||||
g = parser.add_mutually_exclusive_group()
|
||||
g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
|
||||
g.add_argument(
|
||||
"--iteration",
|
||||
type=int,
|
||||
help="the iteration of the checkpoint to load from output directory")
|
||||
|
||||
parser.add_argument(
|
||||
"--wavenet", type=str, help="wavenet checkpoint to use")
|
||||
|
||||
parser.add_argument(
|
||||
"output",
|
||||
type=str,
|
||||
default="experiment",
|
||||
help="path to save experiment results")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
config = ruamel.yaml.safe_load(f)
|
||||
|
||||
if args.device == -1:
|
||||
place = fluid.CPUPlace()
|
||||
else:
|
||||
place = fluid.CUDAPlace(args.device)
|
||||
|
||||
dg.enable_dygraph(place)
|
||||
|
||||
print("Command Line args: ")
|
||||
for k, v in vars(args).items():
|
||||
print("{}: {}".format(k, v))
|
||||
|
||||
ljspeech_meta = LJSpeechMetaData(args.data)
|
||||
|
||||
data_config = config["data"]
|
||||
sample_rate = data_config["sample_rate"]
|
||||
n_fft = data_config["n_fft"]
|
||||
win_length = data_config["win_length"]
|
||||
hop_length = data_config["hop_length"]
|
||||
n_mels = data_config["n_mels"]
|
||||
train_clip_seconds = data_config["train_clip_seconds"]
|
||||
transform = Transform(sample_rate, n_fft, win_length, hop_length, n_mels)
|
||||
ljspeech = TransformDataset(ljspeech_meta, transform)
|
||||
|
||||
valid_size = data_config["valid_size"]
|
||||
ljspeech_valid = CacheDataset(SliceDataset(ljspeech, 0, valid_size))
|
||||
ljspeech_train = CacheDataset(
|
||||
SliceDataset(ljspeech, valid_size, len(ljspeech)))
|
||||
|
||||
teacher_config = config["teacher"]
|
||||
n_loop = teacher_config["n_loop"]
|
||||
n_layer = teacher_config["n_layer"]
|
||||
filter_size = teacher_config["filter_size"]
|
||||
context_size = 1 + n_layer * sum([filter_size**i for i in range(n_loop)])
|
||||
print("context size is {} samples".format(context_size))
|
||||
train_batch_fn = DataCollector(context_size, sample_rate, hop_length,
|
||||
train_clip_seconds)
|
||||
valid_batch_fn = DataCollector(
|
||||
context_size, sample_rate, hop_length, train_clip_seconds, valid=True)
|
||||
|
||||
batch_size = data_config["batch_size"]
|
||||
train_cargo = DataCargo(
|
||||
ljspeech_train,
|
||||
train_batch_fn,
|
||||
batch_size,
|
||||
sampler=RandomSampler(ljspeech_train))
|
||||
|
||||
# only batch_size=1 is enabled for validation
|
||||
valid_cargo = DataCargo(
|
||||
ljspeech_valid,
|
||||
valid_batch_fn,
|
||||
batch_size=1,
|
||||
sampler=SequentialSampler(ljspeech_valid))
|
||||
|
||||
make_output_tree(args.output)
|
||||
|
||||
# conditioner(upsampling net)
|
||||
conditioner_config = config["conditioner"]
|
||||
upsampling_factors = conditioner_config["upsampling_factors"]
|
||||
upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
|
||||
freeze(upsample_net)
|
||||
|
||||
residual_channels = teacher_config["residual_channels"]
|
||||
loss_type = teacher_config["loss_type"]
|
||||
output_dim = teacher_config["output_dim"]
|
||||
log_scale_min = teacher_config["log_scale_min"]
|
||||
assert loss_type == "mog" and output_dim == 3, \
|
||||
"the teacher wavenet should be a wavenet with single gaussian output"
|
||||
|
||||
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
|
||||
filter_size, loss_type, log_scale_min)
|
||||
freeze(teacher)
|
||||
|
||||
student_config = config["student"]
|
||||
n_loops = student_config["n_loops"]
|
||||
n_layers = student_config["n_layers"]
|
||||
student_residual_channels = student_config["residual_channels"]
|
||||
student_filter_size = student_config["filter_size"]
|
||||
student_log_scale_min = student_config["log_scale_min"]
|
||||
student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
|
||||
n_mels, student_filter_size)
|
||||
|
||||
stft_config = config["stft"]
|
||||
stft = STFT(
|
||||
n_fft=stft_config["n_fft"],
|
||||
hop_length=stft_config["hop_length"],
|
||||
win_length=stft_config["win_length"])
|
||||
|
||||
lmd = config["loss"]["lmd"]
|
||||
model = Clarinet(upsample_net, teacher, student, stft,
|
||||
student_log_scale_min, lmd)
|
||||
summary(model)
|
||||
|
||||
# optim
|
||||
train_config = config["train"]
|
||||
learning_rate = train_config["learning_rate"]
|
||||
anneal_rate = train_config["anneal_rate"]
|
||||
anneal_interval = train_config["anneal_interval"]
|
||||
lr_scheduler = dg.ExponentialDecay(
|
||||
learning_rate, anneal_interval, anneal_rate, staircase=True)
|
||||
gradient_max_norm = train_config["gradient_max_norm"]
|
||||
optim = fluid.optimizer.Adam(
|
||||
lr_scheduler,
|
||||
parameter_list=model.parameters(),
|
||||
grad_clip=fluid.clip.ClipByGlobalNorm(gradient_max_norm))
|
||||
|
||||
# train
|
||||
max_iterations = train_config["max_iterations"]
|
||||
checkpoint_interval = train_config["checkpoint_interval"]
|
||||
eval_interval = train_config["eval_interval"]
|
||||
checkpoint_dir = os.path.join(args.output, "checkpoints")
|
||||
state_dir = os.path.join(args.output, "states")
|
||||
log_dir = os.path.join(args.output, "log")
|
||||
writer = LogWriter(log_dir)
|
||||
|
||||
if args.checkpoint is not None:
|
||||
iteration = io.load_parameters(
|
||||
model, optim, checkpoint_path=args.checkpoint)
|
||||
else:
|
||||
iteration = io.load_parameters(
|
||||
model,
|
||||
optim,
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
iteration=args.iteration)
|
||||
|
||||
if iteration == 0:
|
||||
assert args.wavenet is not None, "When training afresh, a trained wavenet model should be provided."
|
||||
load_wavenet(model, args.wavenet)
|
||||
|
||||
# loader
|
||||
train_loader = fluid.io.DataLoader.from_generator(
|
||||
capacity=10, return_list=True)
|
||||
train_loader.set_batch_generator(train_cargo, place)
|
||||
|
||||
valid_loader = fluid.io.DataLoader.from_generator(
|
||||
capacity=10, return_list=True)
|
||||
valid_loader.set_batch_generator(valid_cargo, place)
|
||||
|
||||
# training loop
|
||||
global_step = iteration + 1
|
||||
iterator = iter(tqdm(train_loader))
|
||||
while global_step <= max_iterations:
|
||||
try:
|
||||
batch = next(iterator)
|
||||
except StopIteration as e:
|
||||
iterator = iter(tqdm(train_loader))
|
||||
batch = next(iterator)
|
||||
|
||||
audios, mels, audio_starts = batch
|
||||
model.train()
|
||||
loss_dict = model(
|
||||
audios, mels, audio_starts, clip_kl=global_step > 500)
|
||||
|
||||
writer.add_scalar("learning_rate",
|
||||
optim._learning_rate.step().numpy()[0], global_step)
|
||||
for k, v in loss_dict.items():
|
||||
writer.add_scalar("loss/{}".format(k), v.numpy()[0], global_step)
|
||||
|
||||
l = loss_dict["loss"]
|
||||
step_loss = l.numpy()[0]
|
||||
print("[train] global_step: {} loss: {:<8.6f}".format(global_step,
|
||||
step_loss))
|
||||
|
||||
l.backward()
|
||||
optim.minimize(l)
|
||||
optim.clear_gradients()
|
||||
|
||||
if global_step % eval_interval == 0:
|
||||
# evaluate on valid dataset
|
||||
eval_model(model, valid_loader, state_dir, global_step,
|
||||
sample_rate)
|
||||
if global_step % checkpoint_interval == 0:
|
||||
io.save_parameters(checkpoint_dir, global_step, model, optim)
|
||||
|
||||
global_step += 1
|
|
@ -1,60 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import division
|
||||
import os
|
||||
import soundfile as sf
|
||||
from collections import OrderedDict
|
||||
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
def make_output_tree(output_dir):
|
||||
checkpoint_dir = os.path.join(output_dir, "checkpoints")
|
||||
if not os.path.exists(checkpoint_dir):
|
||||
os.makedirs(checkpoint_dir)
|
||||
|
||||
state_dir = os.path.join(output_dir, "states")
|
||||
if not os.path.exists(state_dir):
|
||||
os.makedirs(state_dir)
|
||||
|
||||
|
||||
def eval_model(model, valid_loader, output_dir, iteration, sample_rate):
|
||||
model.eval()
|
||||
for i, batch in enumerate(valid_loader):
|
||||
# print("sentence {}".format(i))
|
||||
path = os.path.join(output_dir,
|
||||
"sentence_{}_step_{}.wav".format(i, iteration))
|
||||
audio_clips, mel_specs, audio_starts = batch
|
||||
wav_var = model.synthesis(mel_specs)
|
||||
wav_np = wav_var.numpy()[0]
|
||||
sf.write(path, wav_np, samplerate=sample_rate)
|
||||
print("generated {}".format(path))
|
||||
|
||||
|
||||
def load_wavenet(model, path):
|
||||
wavenet_dict, _ = dg.load_dygraph(path)
|
||||
encoder_dict = OrderedDict()
|
||||
teacher_dict = OrderedDict()
|
||||
for k, v in wavenet_dict.items():
|
||||
if k.startswith("encoder."):
|
||||
encoder_dict[k.split('.', 1)[1]] = v
|
||||
else:
|
||||
# k starts with "decoder."
|
||||
teacher_dict[k.split('.', 1)[1]] = v
|
||||
|
||||
model.encoder.set_dict(encoder_dict)
|
||||
model.teacher.set_dict(teacher_dict)
|
||||
print("loaded the encoder part and teacher part from wavenet model.")
|
|
@ -1,144 +0,0 @@
|
|||
# Deep Voice 3
|
||||
|
||||
PaddlePaddle dynamic graph implementation of Deep Voice 3, a convolutional network based text-to-speech generative model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
|
||||
|
||||
We implement Deep Voice 3 using Paddle Fluid with dynamic graph, which is convenient for building flexible network architectures.
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
## Model Architecture
|
||||
|
||||

|
||||
|
||||
The model consists of an encoder, a decoder and a converter (and a speaker embedding for multispeaker models). The encoder and the decoder together form the seq2seq part of the model, and the converter forms the postnet part.
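As a rough sketch of how these parts fit together, the snippet below condenses the single-speaker branch of `create_model()` from `train.py`; the literal numbers stand in for the values read from the yaml config and are only illustrative.

```python
from paddle.fluid import dygraph as dg
from paddle.fluid import initializer as I
from parakeet.models.deepvoice3 import Encoder, Decoder, PostNet, SpectraNet
from parakeet.g2p import en

char_dim, n_mels, r = 256, 80, 4   # text embedding size, mel bands, reduction factor
embedding = dg.Embedding((en.n_vocab, char_dim), param_attr=I.Normal(scale=0.1))
# seq2seq part: encoder + attention-based decoder
encoder = Encoder(7, char_dim, 64, 5,
                  has_bias=False, bias_dim=16, keep_prob=0.95)
decoder = Decoder(n_mels, r, [128, char_dim], 8, 5, 128,
                  position_encoding_weight=1.0, omega=5.54,
                  has_bias=False, bias_dim=16, keep_prob=0.95)
# postnet part: the converter that refines the decoder output
converter = PostNet(5, char_dim, 256, 5, n_mels, r,
                    has_bias=False, bias_dim=16, keep_prob=0.95)
# the speaker embedding is None for a single-speaker model
model = SpectraNet(embedding, None, encoder, decoder, converter)
```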
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── config/
|
||||
├── synthesize.py
|
||||
├── data.py
|
||||
├── preprocess.py
|
||||
├── clip.py
|
||||
├── train.py
|
||||
└── vocoder.py
|
||||
```
|
||||
|
||||
## Preprocess
|
||||
|
||||
Preprocess the dataset with `preprocess.py`.
|
||||
|
||||
```text
|
||||
usage: preprocess.py [-h] --config CONFIG --input INPUT --output OUTPUT
|
||||
|
||||
preprocess ljspeech dataset and save it.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file
|
||||
--input INPUT data path of the original data
|
||||
--output OUTPUT path to save the preprocessed dataset
|
||||
```
|
||||
|
||||
example code:
|
||||
|
||||
```bash
|
||||
python preprocess.py --config=configs/ljspeech.yaml --input=LJSpeech-1.1/ --output=data/ljspeech
|
||||
```
|
||||
|
||||
## Train
|
||||
|
||||
Train the model using `train.py`; follow the usage displayed by `python train.py --help`.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] --config CONFIG --input INPUT
|
||||
|
||||
train a Deep Voice 3 model with LJSpeech
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file
|
||||
--input INPUT data path of the original data
|
||||
```
|
||||
|
||||
example code:
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 python train.py --config=configs/ljspeech.yaml --input=data/ljspeech
|
||||
```
|
||||
|
||||
Training creates a `runs` folder; outputs for each run are saved in a separate folder in `runs`, whose name is the start time joined with the hostname. Inside this folder, the tensorboard log, parameters and optimizer states are saved. Parameters (`*.pdparams`) and optimizer states (`*.pdopt`) are named by the step at which they are saved.
|
||||
|
||||
```text
|
||||
runs/Jul07_09-39-34_instance-mqcyj27y-4/
|
||||
├── checkpoint
|
||||
├── events.out.tfevents.1594085974.instance-mqcyj27y-4
|
||||
├── step-1000000.pdopt
|
||||
├── step-1000000.pdparams
|
||||
├── step-100000.pdopt
|
||||
├── step-100000.pdparams
|
||||
...
|
||||
```
|
||||
|
||||
Since we use WaveFlow to synthesize audio while training, download the trained WaveFlow model and extract it into the current directory before training.
|
||||
|
||||
```bash
|
||||
wget https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip
|
||||
unzip waveflow_res128_ljspeech_ckpt_1.0.zip
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Visualization
|
||||
|
||||
You can visualize training losses, check the attention plots and listen to the audio synthesized with teacher forcing during training.
|
||||
|
||||
example code:
|
||||
|
||||
```bash
|
||||
tensorboard --logdir=runs/ --host=$HOSTNAME --port=8000
|
||||
```
|
||||
|
||||
## Synthesis
|
||||
|
||||
```text
|
||||
usage: synthesize from a checkpoint [-h] --config CONFIG --input INPUT
|
||||
--output OUTPUT --checkpoint CHECKPOINT
|
||||
--monotonic_layers MONOTONIC_LAYERS
|
||||
[--vocoder {griffin-lim,waveflow}]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file
|
||||
--input INPUT text file to synthesize
|
||||
--output OUTPUT path to save audio
|
||||
--checkpoint CHECKPOINT
|
||||
data path of the checkpoint
|
||||
--monotonic_layers MONOTONIC_LAYERS
|
||||
monotonic decoder layers' indices(start from 1)
|
||||
--vocoder {griffin-lim,waveflow}
|
||||
vocoder to use
|
||||
```
|
||||
|
||||
`synthesize.py` is used to synthesize several sentences in a text file.
|
||||
`--monotonic_layers` gives the indices of the decoder layers that manifest monotonic diagonal attention. You can find these layers by inspecting the tensorboard logs; mind that the indices start from 1. Which layers manifest monotonic diagonal attention is stable for a given model across training and synthesis, but differs between runs, so once you have identified the monotonic layers from the tensorboard log you can reuse them for synthesis. Note that only decoder layers that show strong diagonal attention should be considered (see the sketch below).
|
||||
`--vocoder` is the vocoder to use. Current supported values are "waveflow" and "griffin-lim". Default value is "waveflow".
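The sketch below shows how those 1-based indices are consumed at synthesis time; it is condensed from `synthesize.py` in this example.

```python
decoder_layers = 8                                        # decoder_layers in the config
# "--monotonic_layers 5,6" becomes the zero-based indices [4, 5]
monotonic_layers = [int(i.strip()) - 1 for i in "5,6".split(",")]
force_monotonic_attention = [False] * decoder_layers
for i in monotonic_layers:
    force_monotonic_attention[i] = True
# force_monotonic_attention is then passed to the model together with the
# attention window (backward_step, forward_step) from the config.
```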
|
||||
|
||||
example code:
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=2 python synthesize.py \
|
||||
--config configs/ljspeech.yaml \
|
||||
--input sentences.txt \
|
||||
--output outputs/ \
|
||||
--checkpoint runs/Jul07_09-39-34_instance-mqcyj27y-4/step-1320000 \
|
||||
--monotonic_layers "5,6" \
|
||||
--vocoder waveflow
|
||||
```
|
|
@ -1,84 +0,0 @@
|
|||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import six
|
||||
import warnings
|
||||
|
||||
import functools
|
||||
from paddle.fluid import layers
|
||||
from paddle.fluid import framework
|
||||
from paddle.fluid import core
|
||||
from paddle.fluid import name_scope
|
||||
from paddle.fluid.dygraph import base as imperative_base
|
||||
from paddle.fluid.clip import GradientClipBase, _correct_clip_op_role_var
|
||||
|
||||
class DoubleClip(GradientClipBase):
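    """Gradient clipping used by this example: gradients are first clipped
    element-wise to [-clip_value, clip_value], then rescaled together so that
    their global norm does not exceed clip_norm."""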
|
||||
def __init__(self, clip_value, clip_norm, group_name="default_group", need_clip=None):
|
||||
super(DoubleClip, self).__init__(need_clip)
|
||||
self.clip_value = float(clip_value)
|
||||
self.clip_norm = float(clip_norm)
|
||||
self.group_name = group_name
|
||||
|
||||
def __str__(self):
|
||||
return "Gradient Clip By Value and GlobalNorm, value={}, global_norm={}".format(
|
||||
self.clip_value, self.clip_norm)
|
||||
|
||||
@imperative_base.no_grad
|
||||
def _dygraph_clip(self, params_grads):
|
||||
params_grads = self._dygraph_clip_by_value(params_grads)
|
||||
params_grads = self._dygraph_clip_by_global_norm(params_grads)
|
||||
return params_grads
|
||||
|
||||
@imperative_base.no_grad
|
||||
def _dygraph_clip_by_value(self, params_grads):
|
||||
params_and_grads = []
|
||||
for p, g in params_grads:
|
||||
if g is None:
|
||||
continue
|
||||
if self._need_clip_func is not None and not self._need_clip_func(p):
|
||||
params_and_grads.append((p, g))
|
||||
continue
|
||||
new_grad = layers.clip(x=g, min=-self.clip_value, max=self.clip_value)
|
||||
params_and_grads.append((p, new_grad))
|
||||
return params_and_grads
|
||||
|
||||
@imperative_base.no_grad
|
||||
def _dygraph_clip_by_global_norm(self, params_grads):
|
||||
params_and_grads = []
|
||||
sum_square_list = []
|
||||
for p, g in params_grads:
|
||||
if g is None:
|
||||
continue
|
||||
if self._need_clip_func is not None and not self._need_clip_func(p):
|
||||
continue
|
||||
merge_grad = g
|
||||
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
|
||||
merge_grad = layers.merge_selected_rows(g)
|
||||
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
|
||||
square = layers.square(merge_grad)
|
||||
sum_square = layers.reduce_sum(square)
|
||||
sum_square_list.append(sum_square)
|
||||
|
||||
# all parameters have been filtered out
|
||||
if len(sum_square_list) == 0:
|
||||
return params_grads
|
||||
|
||||
global_norm_var = layers.concat(sum_square_list)
|
||||
global_norm_var = layers.reduce_sum(global_norm_var)
|
||||
global_norm_var = layers.sqrt(global_norm_var)
|
||||
max_global_norm = layers.fill_constant(
|
||||
shape=[1], dtype='float32', value=self.clip_norm)
|
||||
clip_var = layers.elementwise_div(
|
||||
x=max_global_norm,
|
||||
y=layers.elementwise_max(
|
||||
x=global_norm_var, y=max_global_norm))
|
||||
for p, g in params_grads:
|
||||
if g is None:
|
||||
continue
|
||||
if self._need_clip_func is not None and not self._need_clip_func(p):
|
||||
params_and_grads.append((p, g))
|
||||
continue
|
||||
new_grad = layers.elementwise_mul(x=g, y=clip_var)
|
||||
params_and_grads.append((p, new_grad))
|
||||
|
||||
return params_and_grads
|
|
@ -1,46 +0,0 @@
|
|||
# data processing
|
||||
p_pronunciation: 0.99
|
||||
sample_rate: 22050 # Hz
|
||||
n_fft: 1024
|
||||
win_length: 1024
|
||||
hop_length: 256
|
||||
n_mels: 80
|
||||
reduction_factor: 4
|
||||
|
||||
# model-s2s
|
||||
n_speakers: 1
|
||||
speaker_dim: 16
|
||||
char_dim: 256
|
||||
encoder_dim: 64
|
||||
kernel_size: 5
|
||||
encoder_layers: 7
|
||||
decoder_layers: 8
|
||||
prenet_sizes: [128]
|
||||
attention_dim: 128
|
||||
|
||||
# model-postnet
|
||||
postnet_layers: 5
|
||||
postnet_dim: 256
|
||||
|
||||
# position embedding
|
||||
position_weight: 1.0
|
||||
position_rate: 5.54
|
||||
forward_step: 4
|
||||
backward_step: 0
|
||||
|
||||
dropout: 0.05
|
||||
|
||||
# output-griffinlim
|
||||
sharpening_factor: 1.4
|
||||
|
||||
# optimizer:
|
||||
learning_rate: 0.001
|
||||
clip_value: 5.0
|
||||
clip_norm: 100.0
|
||||
|
||||
# training:
|
||||
max_iteration: 1000000
|
||||
batch_size: 16
|
||||
report_interval: 10000
|
||||
save_interval: 10000
|
||||
valid_size: 5
|
|
@ -1,108 +0,0 @@
|
|||
import numpy as np
|
||||
import os
|
||||
import csv
|
||||
import pandas as pd
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
from paddle.fluid import dygraph as dg
|
||||
from paddle.fluid.dataloader import Dataset, BatchSampler
|
||||
from paddle.fluid.io import DataLoader
|
||||
|
||||
from parakeet.data import DatasetMixin, DataCargo, PartialyRandomizedSimilarTimeLengthSampler
|
||||
from parakeet.g2p import en
|
||||
|
||||
class LJSpeech(DatasetMixin):
|
||||
def __init__(self, root):
|
||||
self._root = root
|
||||
self._table = pd.read_csv(
|
||||
os.path.join(root, "metadata.csv"),
|
||||
sep="|",
|
||||
encoding="utf-8",
|
||||
quoting=csv.QUOTE_NONE,
|
||||
header=None,
|
||||
names=["num_frames", "spec_name", "mel_name", "text"],
|
||||
dtype={"num_frames": np.int64, "spec_name": str, "mel_name":str, "text":str})
|
||||
|
||||
def num_frames(self):
|
||||
return self._table["num_frames"].to_list()
|
||||
|
||||
def get_example(self, i):
|
||||
"""
|
||||
spec (T_frame, C_spec)
|
||||
mel (T_frame, C_mel)
|
||||
"""
|
||||
num_frames, spec_name, mel_name, text = self._table.iloc[i]
|
||||
spec = np.load(os.path.join(self._root, spec_name))
|
||||
mel = np.load(os.path.join(self._root, mel_name))
|
||||
return (text, spec, mel, num_frames)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._table)
|
||||
|
||||
class DataCollector(object):
|
||||
def __init__(self, p_pronunciation):
|
||||
self.p_pronunciation = p_pronunciation
|
||||
|
||||
def __call__(self, examples):
|
||||
"""
|
||||
output shape and dtype
|
||||
(B, T_text) int64
|
||||
(B,) int64
|
||||
(B, T_frame, C_spec) float32
|
||||
(B, T_frame, C_mel) float32
|
||||
(B,) int64
|
||||
"""
|
||||
text_seqs = []
|
||||
specs = []
|
||||
mels = []
|
||||
num_frames = np.array([example[3] for example in examples], dtype=np.int64)
|
||||
max_frames = np.max(num_frames)
|
||||
|
||||
for example in examples:
|
||||
text, spec, mel, _ = example
|
||||
text_seqs.append(en.text_to_sequence(text, self.p_pronunciation))
|
||||
specs.append(np.pad(spec, [(0, max_frames - spec.shape[0]), (0, 0)], mode="constant"))
|
||||
mels.append(np.pad(mel, [(0, max_frames - mel.shape[0]), (0, 0)], mode="constant"))
|
||||
|
||||
specs = np.stack(specs)
|
||||
mels = np.stack(mels)
|
||||
|
||||
text_lengths = np.array([len(seq) for seq in text_seqs], dtype=np.int64)
|
||||
max_length = np.max(text_lengths)
|
||||
text_seqs = np.array([seq + [0] * (max_length - len(seq)) for seq in text_seqs], dtype=np.int64)
|
||||
return text_seqs, text_lengths, specs, mels, num_frames
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import tqdm
|
||||
import time
|
||||
from ruamel import yaml
|
||||
|
||||
parser = argparse.ArgumentParser(description="load the preprocessed ljspeech dataset")
|
||||
parser.add_argument("--config", type=str, required=True, help="config file")
|
||||
parser.add_argument("--input", type=str, required=True, help="data path of the original data")
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
print("========= Command Line Arguments ========")
|
||||
for k, v in vars(args).items():
|
||||
print("{}: {}".format(k, v))
|
||||
print("=========== Configurations ==============")
|
||||
for k in ["p_pronunciation", "batch_size"]:
|
||||
print("{}: {}".format(k, config[k]))
|
||||
|
||||
ljspeech = LJSpeech(args.input)
|
||||
collate_fn = DataCollector(config["p_pronunciation"])
|
||||
|
||||
dg.enable_dygraph(fluid.CPUPlace())
|
||||
sampler = PartialyRandomizedSimilarTimeLengthSampler(ljspeech.num_frames())
|
||||
cargo = DataCargo(ljspeech, collate_fn,
|
||||
batch_size=config["batch_size"], sampler=sampler)
|
||||
loader = DataLoader\
|
||||
.from_generator(capacity=5, return_list=True)\
|
||||
.set_batch_generator(cargo)
|
||||
|
||||
for i, batch in tqdm.tqdm(enumerate(loader)):
|
||||
continue
|
Binary file not shown.
|
@ -1,122 +0,0 @@
|
|||
from __future__ import division
|
||||
import os
|
||||
import argparse
|
||||
from ruamel import yaml
|
||||
import tqdm
|
||||
from os.path import join
|
||||
import csv
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
import logging
|
||||
|
||||
from parakeet.data import DatasetMixin
|
||||
|
||||
|
||||
class LJSpeechMetaData(DatasetMixin):
|
||||
def __init__(self, root):
|
||||
self.root = root
|
||||
self._wav_dir = join(root, "wavs")
|
||||
csv_path = join(root, "metadata.csv")
|
||||
self._table = pd.read_csv(
|
||||
csv_path,
|
||||
sep="|",
|
||||
encoding="utf-8",
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
names=["fname", "raw_text", "normalized_text"])
|
||||
|
||||
def get_example(self, i):
|
||||
fname, raw_text, normalized_text = self._table.iloc[i]
|
||||
abs_fname = join(self._wav_dir, fname + ".wav")
|
||||
return fname, abs_fname, raw_text, normalized_text
|
||||
|
||||
def __len__(self):
|
||||
return len(self._table)
|
||||
|
||||
|
||||
class Transform(object):
|
||||
def __init__(self, sample_rate, n_fft, hop_length, win_length, n_mels, reduction_factor):
|
||||
self.sample_rate = sample_rate
|
||||
self.n_fft = n_fft
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.n_mels = n_mels
|
||||
self.reduction_factor = reduction_factor
|
||||
|
||||
def __call__(self, fname):
|
||||
# wave processing
|
||||
audio, _ = librosa.load(fname, sr=self.sample_rate)
|
||||
|
||||
# Pad the data to the right size to have a whole number of timesteps,
|
||||
# accounting properly for the model reduction factor.
|
||||
frames = audio.size // (self.reduction_factor * self.hop_length) + 1
|
||||
# librosa's stft extracts frames of n_fft size, so we should pad n_fft // 2 on both sides
|
||||
desired_length = (frames * self.reduction_factor - 1) * self.hop_length + self.n_fft
|
||||
pad_amount = (desired_length - audio.size) // 2
|
||||
|
||||
# we pad manually to control the number of generated frames
|
||||
if audio.size % 2 == 0:
|
||||
audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect')
|
||||
else:
|
||||
audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect')
|
||||
|
||||
# STFT
|
||||
D = librosa.stft(audio, self.n_fft, self.hop_length, self.win_length, center=False)
|
||||
S = np.abs(D)
|
||||
S_mel = librosa.feature.melspectrogram(sr=self.sample_rate, S=S, n_mels=self.n_mels, fmax=8000.0)
|
||||
|
||||
# log magnitude
|
||||
log_spectrogram = np.log(np.clip(S, a_min=1e-5, a_max=None))
|
||||
log_mel_spectrogram = np.log(np.clip(S_mel, a_min=1e-5, a_max=None))
|
||||
num_frames = log_spectrogram.shape[-1]
|
||||
assert num_frames % self.reduction_factor == 0, "num_frames is wrong"
|
||||
return (log_spectrogram.T, log_mel_spectrogram.T, num_frames)
|
||||
|
||||
|
||||
def save(output_path, dataset, transform):
|
||||
if not os.path.exists(output_path):
|
||||
os.makedirs(output_path)
|
||||
records = []
|
||||
for example in tqdm.tqdm(dataset):
|
||||
fname, abs_fname, _, normalized_text = example
|
||||
log_spec, log_mel_spec, num_frames = transform(abs_fname)
|
||||
records.append((num_frames,
|
||||
fname + "_spec.npy",
|
||||
fname + "_mel.npy",
|
||||
normalized_text))
|
||||
np.save(join(output_path, fname + "_spec"), log_spec)
|
||||
np.save(join(output_path, fname + "_mel"), log_mel_spec)
|
||||
meta_data = pd.DataFrame.from_records(records)
|
||||
meta_data.to_csv(join(output_path, "metadata.csv"),
|
||||
quoting=csv.QUOTE_NONE, sep="|", encoding="utf-8",
|
||||
header=False, index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="preprocess ljspeech dataset and save it.")
|
||||
parser.add_argument("--config", type=str, required=True, help="config file")
|
||||
parser.add_argument("--input", type=str, required=True, help="data path of the original data")
|
||||
parser.add_argument("--output", type=str, required=True, help="path to save the preprocessed dataset")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
print("========= Command Line Arguments ========")
|
||||
for k, v in vars(args).items():
|
||||
print("{}: {}".format(k, v))
|
||||
print("=========== Configurations ==============")
|
||||
for k in ["sample_rate", "n_fft", "win_length",
|
||||
"hop_length", "n_mels", "reduction_factor"]:
|
||||
print("{}: {}".format(k, config[k]))
|
||||
|
||||
ljspeech_meta = LJSpeechMetaData(args.input)
|
||||
transform = Transform(config["sample_rate"],
|
||||
config["n_fft"],
|
||||
config["hop_length"],
|
||||
config["win_length"],
|
||||
config["n_mels"],
|
||||
config["reduction_factor"])
|
||||
save(args.output, ljspeech_meta, transform)
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
import numpy as np
|
||||
from matplotlib import cm
|
||||
import librosa
|
||||
import os
|
||||
import time
|
||||
import tqdm
|
||||
import argparse
|
||||
from ruamel import yaml
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
from paddle.fluid import layers as F
|
||||
from paddle.fluid import dygraph as dg
|
||||
from paddle.fluid.io import DataLoader
|
||||
import soundfile as sf
|
||||
|
||||
from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
|
||||
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
|
||||
from parakeet.g2p import en
|
||||
from parakeet.models.deepvoice3.weight_norm_hook import remove_weight_norm
|
||||
from vocoder import WaveflowVocoder, GriffinLimVocoder
|
||||
from train import create_model
|
||||
|
||||
|
||||
def main(args, config):
|
||||
model = create_model(config)
|
||||
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
|
||||
for name, layer in model.named_sublayers():
|
||||
try:
|
||||
remove_weight_norm(layer)
|
||||
except ValueError:
|
||||
# this layer has no weight norm hook
|
||||
pass
|
||||
model.eval()
|
||||
if args.vocoder == "waveflow":
|
||||
vocoder = WaveflowVocoder()
|
||||
vocoder.model.eval()
|
||||
elif args.vocoder == "griffin-lim":
|
||||
vocoder = GriffinLimVocoder(
|
||||
sharpening_factor=config["sharpening_factor"],
|
||||
sample_rate=config["sample_rate"],
|
||||
n_fft=config["n_fft"],
|
||||
win_length=config["win_length"],
|
||||
hop_length=config["hop_length"])
|
||||
else:
|
||||
raise ValueError("Other vocoders are not supported.")
|
||||
|
||||
if not os.path.exists(args.output):
|
||||
os.makedirs(args.output)
|
||||
monotonic_layers = [int(item.strip()) - 1 for item in args.monotonic_layers.split(',')]
|
||||
with open(args.input, 'rt') as f:
|
||||
sentences = [line.strip() for line in f.readlines()]
|
||||
for i, sentence in enumerate(sentences):
|
||||
wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
|
||||
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
|
||||
wav, samplerate=config["sample_rate"])
|
||||
|
||||
|
||||
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
|
||||
print("[synthesize] {}".format(sentence))
|
||||
text = en.text_to_sequence(sentence, p=1.0)
|
||||
text = np.expand_dims(np.array(text, dtype="int64"), 0)
|
||||
lengths = np.array([text.size], dtype=np.int64)
|
||||
text_seqs = dg.to_variable(text)
|
||||
text_lengths = dg.to_variable(lengths)
|
||||
|
||||
decoder_layers = config["decoder_layers"]
|
||||
force_monotonic_attention = [False] * decoder_layers
|
||||
for i in monotonic_layers:
|
||||
force_monotonic_attention[i] = True
|
||||
|
||||
with dg.no_grad():
|
||||
outputs = model(text_seqs, text_lengths, speakers=None,
|
||||
force_monotonic_attention=force_monotonic_attention,
|
||||
window=(config["backward_step"], config["forward_step"]))
|
||||
decoded, refined, attentions = outputs
|
||||
if args.vocoder == "griffin-lim":
|
||||
wav_np = vocoder(refined.numpy()[0].T)
|
||||
else:
|
||||
wav = vocoder(F.transpose(refined, (0, 2, 1)))
|
||||
wav_np = wav.numpy()[0]
|
||||
return wav_np
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
from ruamel import yaml
|
||||
parser = argparse.ArgumentParser("synthesize from a checkpoint")
|
||||
parser.add_argument("--config", type=str, required=True, help="config file")
|
||||
parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
|
||||
parser.add_argument("--output", type=str, required=True, help="path to save audio")
|
||||
parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
|
||||
parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layers' indices(start from 1)")
|
||||
parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
dg.enable_dygraph(fluid.CUDAPlace(0))
|
||||
main(args, config)
|
|
@ -1,187 +0,0 @@
|
|||
import numpy as np
|
||||
from matplotlib import cm
|
||||
import librosa
|
||||
import os
|
||||
import time
|
||||
import tqdm
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
from paddle.fluid import layers as F
|
||||
from paddle.fluid import initializer as I
|
||||
from paddle.fluid import dygraph as dg
|
||||
from paddle.fluid.io import DataLoader
|
||||
from visualdl import LogWriter
|
||||
|
||||
from parakeet.models.deepvoice3 import Encoder, Decoder, PostNet, SpectraNet
|
||||
from parakeet.data import SliceDataset, DataCargo, SequentialSampler, RandomSampler
|
||||
from parakeet.utils.io import save_parameters, load_parameters
|
||||
from parakeet.g2p import en
|
||||
|
||||
from data import LJSpeech, DataCollector
|
||||
from vocoder import WaveflowVocoder, GriffinLimVocoder
|
||||
from clip import DoubleClip
|
||||
|
||||
|
||||
def create_model(config):
|
||||
char_embedding = dg.Embedding((en.n_vocab, config["char_dim"]), param_attr=I.Normal(scale=0.1))
|
||||
multi_speaker = config["n_speakers"] > 1
|
||||
speaker_embedding = dg.Embedding((config["n_speakers"], config["speaker_dim"]), param_attr=I.Normal(scale=0.1)) \
|
||||
if multi_speaker else None
|
||||
encoder = Encoder(config["encoder_layers"], config["char_dim"],
|
||||
config["encoder_dim"], config["kernel_size"],
|
||||
has_bias=multi_speaker, bias_dim=config["speaker_dim"],
|
||||
keep_prob=1.0 - config["dropout"])
|
||||
decoder = Decoder(config["n_mels"], config["reduction_factor"],
|
||||
list(config["prenet_sizes"]) + [config["char_dim"]],
|
||||
config["decoder_layers"], config["kernel_size"],
|
||||
config["attention_dim"],
|
||||
position_encoding_weight=config["position_weight"],
|
||||
omega=config["position_rate"],
|
||||
has_bias=multi_speaker, bias_dim=config["speaker_dim"],
|
||||
keep_prob=1.0 - config["dropout"])
|
||||
postnet = PostNet(config["postnet_layers"], config["char_dim"],
|
||||
config["postnet_dim"], config["kernel_size"],
|
||||
config["n_mels"], config["reduction_factor"],
|
||||
has_bias=multi_speaker, bias_dim=config["speaker_dim"],
|
||||
keep_prob=1.0 - config["dropout"])
|
||||
spectranet = SpectraNet(char_embedding, speaker_embedding, encoder, decoder, postnet)
|
||||
return spectranet
|
||||
|
||||
def create_data(config, data_path):
|
||||
dataset = LJSpeech(data_path)
|
||||
|
||||
train_dataset = SliceDataset(dataset, config["valid_size"], len(dataset))
|
||||
train_collator = DataCollector(config["p_pronunciation"])
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
train_cargo = DataCargo(train_dataset, train_collator,
|
||||
batch_size=config["batch_size"], sampler=train_sampler)
|
||||
train_loader = DataLoader\
|
||||
.from_generator(capacity=10, return_list=True)\
|
||||
.set_batch_generator(train_cargo)
|
||||
|
||||
valid_dataset = SliceDataset(dataset, 0, config["valid_size"])
|
||||
valid_collector = DataCollector(1.)
|
||||
valid_sampler = SequentialSampler(valid_dataset)
|
||||
valid_cargo = DataCargo(valid_dataset, valid_collector,
|
||||
batch_size=1, sampler=valid_sampler)
|
||||
valid_loader = DataLoader\
|
||||
.from_generator(capacity=2, return_list=True)\
|
||||
.set_batch_generator(valid_cargo)
|
||||
return train_loader, valid_loader
|
||||
|
||||
def create_optimizer(model, config):
|
||||
optim = fluid.optimizer.Adam(config["learning_rate"],
|
||||
parameter_list=model.parameters(),
|
||||
grad_clip=DoubleClip(config["clip_value"], config["clip_norm"]))
|
||||
return optim
|
||||
|
||||
def train(args, config):
|
||||
model = create_model(config)
|
||||
train_loader, valid_loader = create_data(config, args.input)
|
||||
optim = create_optimizer(model, config)
|
||||
|
||||
global global_step
|
||||
max_iteration = config["max_iteration"]
|
||||
|
||||
iterator = iter(tqdm.tqdm(train_loader))
|
||||
while global_step <= max_iteration:
|
||||
# get inputs
|
||||
try:
|
||||
batch = next(iterator)
|
||||
except StopIteration:
|
||||
iterator = iter(tqdm.tqdm(train_loader))
|
||||
batch = next(iterator)
|
||||
|
||||
# unzip it
|
||||
text_seqs, text_lengths, specs, mels, num_frames = batch
|
||||
|
||||
# forward & backward
|
||||
model.train()
|
||||
outputs = model(text_seqs, text_lengths, speakers=None, mel=mels)
|
||||
decoded, refined, attentions, final_state = outputs
|
||||
|
||||
causal_mel_loss = model.spec_loss(decoded, mels, num_frames)
|
||||
non_causal_mel_loss = model.spec_loss(refined, mels, num_frames)
|
||||
loss = causal_mel_loss + non_causal_mel_loss
|
||||
loss.backward()
|
||||
|
||||
# update
|
||||
optim.minimize(loss)
|
||||
|
||||
# logging
|
||||
tqdm.tqdm.write("[train] step: {}\tloss: {:.6f}\tcausal:{:.6f}\tnon_causal:{:.6f}".format(
|
||||
global_step,
|
||||
loss.numpy()[0],
|
||||
causal_mel_loss.numpy()[0],
|
||||
non_causal_mel_loss.numpy()[0]))
|
||||
writer.add_scalar("loss/causal_mel_loss", causal_mel_loss.numpy()[0], step=global_step)
|
||||
writer.add_scalar("loss/non_causal_mel_loss", non_causal_mel_loss.numpy()[0], step=global_step)
|
||||
writer.add_scalar("loss/loss", loss.numpy()[0], step=global_step)
|
||||
|
||||
if global_step % config["report_interval"] == 0:
|
||||
text_length = int(text_lengths.numpy()[0])
|
||||
num_frame = int(num_frames.numpy()[0])
|
||||
|
||||
tag = "train_mel/ground-truth"
|
||||
img = cm.viridis(normalize(mels.numpy()[0, :num_frame].T))
|
||||
writer.add_image(tag, img, step=global_step)
|
||||
|
||||
tag = "train_mel/decoded"
|
||||
img = cm.viridis(normalize(decoded.numpy()[0, :num_frame].T))
|
||||
writer.add_image(tag, img, step=global_step)
|
||||
|
||||
tag = "train_mel/refined"
|
||||
img = cm.viridis(normalize(refined.numpy()[0, :num_frame].T))
|
||||
writer.add_image(tag, img, step=global_step)
|
||||
|
||||
vocoder = WaveflowVocoder()
|
||||
vocoder.model.eval()
|
||||
|
||||
tag = "train_audio/ground-truth-waveflow"
|
||||
wav = vocoder(F.transpose(mels[0:1, :num_frame, :], (0, 2, 1)))
|
||||
writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)
|
||||
|
||||
tag = "train_audio/decoded-waveflow"
|
||||
wav = vocoder(F.transpose(decoded[0:1, :num_frame, :], (0, 2, 1)))
|
||||
writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)
|
||||
|
||||
tag = "train_audio/refined-waveflow"
|
||||
wav = vocoder(F.transpose(refined[0:1, :num_frame, :], (0, 2, 1)))
|
||||
writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)
|
||||
|
||||
attentions_np = attentions.numpy()
|
||||
attentions_np = attentions_np[:, 0, :num_frame // 4 , :text_length]
|
||||
for i, attention_layer in enumerate(np.rot90(attentions_np, axes=(1,2))):
|
||||
tag = "train_attention/layer_{}".format(i)
|
||||
img = cm.viridis(normalize(attention_layer))
|
||||
writer.add_image(tag, img, step=global_step, dataformats="HWC")
|
||||
|
||||
if global_step % config["save_interval"] == 0:
|
||||
save_parameters(writer.logdir, global_step, model, optim)
|
||||
|
||||
# global step +1
|
||||
global_step += 1
|
||||
|
||||
def normalize(arr):
|
||||
return (arr - arr.min()) / (arr.max() - arr.min())
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
from ruamel import yaml
|
||||
|
||||
parser = argparse.ArgumentParser(description="train a Deep Voice 3 model with LJSpeech")
|
||||
parser.add_argument("--config", type=str, required=True, help="config file")
|
||||
parser.add_argument("--input", type=str, required=True, help="data path of the original data")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
dg.enable_dygraph(fluid.CUDAPlace(0))
|
||||
global global_step
|
||||
global_step = 1
|
||||
global writer
|
||||
writer = LogWriter()
|
||||
print("[Training] tensorboard log and checkpoints are save in {}".format(
|
||||
writer.logdir))
|
||||
train(args, config)
|
|
@ -1,51 +0,0 @@
|
|||
import argparse
|
||||
from ruamel import yaml
|
||||
import numpy as np
|
||||
import librosa
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
from paddle.fluid import layers as F
|
||||
from paddle.fluid import dygraph as dg
|
||||
from parakeet.utils.io import load_parameters
|
||||
from parakeet.models.waveflow.waveflow_modules import WaveFlowModule
|
||||
|
||||
class WaveflowVocoder(object):
|
||||
def __init__(self):
|
||||
config_path = "waveflow_res128_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml"
|
||||
with open(config_path, 'rt') as f:
|
||||
config = yaml.safe_load(f)
|
||||
ns = argparse.Namespace()
|
||||
for k, v in config.items():
|
||||
setattr(ns, k, v)
|
||||
ns.use_fp16 = False
|
||||
|
||||
self.model = WaveFlowModule(ns)
|
||||
checkpoint_path = "waveflow_res128_ljspeech_ckpt_1.0/step-2000000"
|
||||
load_parameters(self.model, checkpoint_path=checkpoint_path)
|
||||
|
||||
def __call__(self, mel):
|
||||
with dg.no_grad():
|
||||
self.model.eval()
|
||||
audio = self.model.synthesize(mel)
|
||||
self.model.train()
|
||||
return audio
|
||||
|
||||
class GriffinLimVocoder(object):
|
||||
def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024,
|
||||
win_length=1024, hop_length=256):
|
||||
self.sample_rate = sample_rate
|
||||
self.n_fft = n_fft
|
||||
self.sharpening_factor = sharpening_factor
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
|
||||
def __call__(self, mel):
|
||||
spec = librosa.feature.inverse.mel_to_stft(
|
||||
np.exp(mel),
|
||||
sr=self.sample_rate,
|
||||
n_fft=self.n_fft,
|
||||
fmin=0, fmax=8000.0, power=1.0)
|
||||
audio = librosa.core.griffinlim(spec ** self.sharpening_factor,
|
||||
win_length=self.win_length, hop_length=self.hop_length)
|
||||
return audio
|
||||
|
|
@ -1,144 +0,0 @@
|
|||
# Fastspeech
|
||||
|
||||
PaddlePaddle dynamic graph implementation of FastSpeech, a feed-forward network based on the Transformer. The implementation is based on [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
## Model Architecture
|
||||
|
||||

|
||||
|
||||
FastSpeech is a feed-forward structure based on the Transformer, instead of the encoder-attention-decoder architecture. The model extracts attention alignments from an encoder-decoder based teacher model for phoneme duration prediction; a length
|
||||
regulator then expands the source phoneme sequence to match the length of the target
|
||||
mel-spectrogram sequence, which enables parallel mel-spectrogram generation. We use TransformerTTS as the teacher model.
|
||||
The model consists of three parts: an encoder, a decoder and a length regulator.
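Below is a minimal NumPy sketch of what the length regulator does (for illustration only, not the model's actual implementation): each phoneme's hidden state is repeated according to its duration, so the expanded sequence has one vector per mel-spectrogram frame.

```python
import numpy as np

# hypothetical shapes: 5 phonemes, hidden size 384 (as in configs/ljspeech.yaml)
phoneme_hidden = np.random.randn(5, 384)     # (num_phonemes, hidden_size)
durations = np.array([3, 7, 2, 5, 4])        # frames assigned to each phoneme
expanded = np.repeat(phoneme_hidden, durations, axis=0)
assert expanded.shape[0] == durations.sum()  # one row per mel frame
```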
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── config # yaml configuration files
|
||||
├── synthesis.py # script to synthesize waveform from text
|
||||
├── train.py # script for model training
|
||||
```
|
||||
|
||||
## Saving & Loading
|
||||
|
||||
`train_transformer.py` and `train_vocoder.py` have 3 arguments in common, `--checkpoint`, `--iteration` and `--output`.
|
||||
|
||||
1. `--output` is the directory for saving results.
|
||||
During training, checkpoints are saved in `${output}/checkpoints` and tensorboard logs are saved in `${output}/log`.
|
||||
During synthesis, results are saved in `${output}/samples` and the tensorboard log is saved in `${output}/log`.
|
||||
|
||||
2. `--checkpoint` is the path of a checkpoint and `--iteration` is the target step. They are used to load checkpoints in the following way (a minimal sketch of this resolution logic is given after the list).
|
||||
|
||||
- If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
|
||||
|
||||
- If `--checkpoint` is not provided, we try to load the checkpoint of the target step specified by `--iteration` from the `${output}/checkpoints/` directory, e.g. given `--iteration 120000`, the checkpoint `${output}/checkpoints/step-120000.*` will be loaded.
|
||||
|
||||
- If neither `--checkpoint` nor `--iteration` is provided, we try to load the latest checkpoint from the `${output}/checkpoints/` directory.
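A minimal sketch of this resolution logic, using a hypothetical helper (the actual loading is handled by `parakeet.utils.io.load_parameters`; the `.pdparams` extension is assumed here):

```python
import os
import glob

def resolve_checkpoint(output, checkpoint=None, iteration=None):
    ckpt_dir = os.path.join(output, "checkpoints")
    if checkpoint is not None:          # 1. an explicit checkpoint path wins
        return checkpoint
    if iteration is not None:           # 2. a specific step in ${output}/checkpoints/
        return os.path.join(ckpt_dir, "step-{}".format(iteration))
    candidates = glob.glob(os.path.join(ckpt_dir, "step-*.pdparams"))
    if not candidates:                  # 3. nothing to resume from
        return None
    latest = max(candidates,
                 key=lambda p: int(os.path.basename(p).split("-")[1].split(".")[0]))
    return latest[:-len(".pdparams")]   # checkpoint path without extension
```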
|
||||
|
||||
## Compute Phoneme Duration
|
||||
|
||||
The ground-truth duration of each phoneme (the number of spectrogram frames that correspond to that phoneme) must be provided when training a FastSpeech model.
|
||||
|
||||
We compute the ground-truth duration of each phoneme in the following way:
|
||||
we extract the encoder-decoder attention alignment from a trained TransformerTTS model,
|
||||
and each frame is assigned to the phoneme that receives the most attention.
|
||||
|
||||
You can run `alignments/get_alignments.py` to compute the alignments.
|
||||
|
||||
```bash
|
||||
cd alignments
|
||||
python get_alignments.py \
|
||||
--use_gpu=1 \
|
||||
--output='./alignments' \
|
||||
--data=${DATAPATH} \
|
||||
--config=${CONFIG} \
|
||||
--checkpoint_transformer=${CHECKPOINT} \
|
||||
```
|
||||
|
||||
where `${DATAPATH}` is the path where the LJSpeech data is saved, `${CHECKPOINT}` is the path of a pretrained TransformerTTS model, and `${CONFIG}` is the yaml config file of the TransformerTTS checkpoint. You need to prepare a pre-trained TransformerTTS checkpoint beforehand.
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python get_alignments.py --help``.
|
||||
|
||||
Alternatively, you can use your own phoneme durations; you just need to process the data into the following format (a sketch of producing such a file is given after the block below).
|
||||
|
||||
```text
|
||||
{'fname1': alignment1,
|
||||
'fname2': alignment2,
|
||||
...}
|
||||
```
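A hedged sketch of how such a file could be produced from an encoder-decoder attention matrix (illustration only; `get_alignments.py` is the supported way). Here `attn` is assumed to have shape `(num_mel_frames, num_phonemes)`, each frame is assigned to the phoneme it attends to most, and the per-phoneme counts become the durations:

```python
import pickle
import numpy as np

def durations_from_attention(attn, num_phonemes):
    frame_to_phoneme = attn.argmax(axis=1)        # (num_mel_frames,)
    return np.bincount(frame_to_phoneme, minlength=num_phonemes)

# toy example with random "attention"; keys are the utterance file names
alignments = {"fname1": durations_from_attention(np.random.rand(700, 120), 120)}
with open("alignments.pkl", "wb") as f:
    pickle.dump(alignments, f)
```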
|
||||
|
||||
## Train FastSpeech
|
||||
|
||||
The FastSpeech model can be trained by running ``train.py``.
|
||||
|
||||
```bash
|
||||
python train.py \
|
||||
--use_gpu=1 \
|
||||
--data=${DATAPATH} \
|
||||
--alignments_path=${ALIGNMENTS_PATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
```
|
||||
|
||||
Or you can run the script file directly.
|
||||
|
||||
```bash
|
||||
sh train.sh
|
||||
```
|
||||
|
||||
If you want to train on multiple GPUs, start training in the following way.
|
||||
|
||||
```bash
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py \
|
||||
--use_gpu=1 \
|
||||
--data=${DATAPATH} \
|
||||
--alignments_path=${ALIGNMENTS_PATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
```
|
||||
|
||||
If you wish to resume from an existing model, see [Saving & Loading](#saving--loading) for details of checkpoint loading.
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python train.py --help``.
|
||||
|
||||
## Synthesis
|
||||
|
||||
After training FastSpeech, audio can be synthesized by running ``synthesis.py``.
|
||||
|
||||
```bash
|
||||
python synthesis.py \
|
||||
--use_gpu=1 \
|
||||
--alpha=1.0 \
|
||||
--checkpoint=${CHECKPOINTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
--output=${OUTPUTPATH} \
|
||||
--vocoder='griffin-lim' \
|
||||
```
|
||||
|
||||
We currently support two vocoders: the Griffin-Lim algorithm and WaveFlow. You can set ``--vocoder`` to choose one of them. If you want to use WaveFlow as your vocoder, you also need to set ``--config_vocoder`` and ``--checkpoint_vocoder``, which are the paths of the vocoder config and checkpoint. You can download a pre-trained WaveFlow model from [here](https://github.com/PaddlePaddle/Parakeet#vocoders).
|
||||
|
||||
Or you can run the script file directly.
|
||||
|
||||
```bash
|
||||
sh synthesis.sh
|
||||
```
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python synthesis.py --help``.
|
||||
|
||||
Then you can find the synthesized audio files in ``${OUTPUTPATH}/samples``.
|
|
@ -1,132 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from scipy.io.wavfile import write
|
||||
from parakeet.g2p.en import text_to_sequence
|
||||
import numpy as np
import librosa  # used below for librosa.load, librosa.stft and librosa.filters.mel
|
||||
import pandas as pd
|
||||
import csv
|
||||
from tqdm import tqdm
|
||||
from ruamel import yaml
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
from pprint import pprint
|
||||
from collections import OrderedDict
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.models.transformer_tts import TransformerTTS
|
||||
from parakeet.models.fastspeech.utils import get_alignment
|
||||
from parakeet.utils import io
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument("--config", type=str, help="path of the config file")
|
||||
parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
|
||||
parser.add_argument("--data", type=str, help="path of LJspeech dataset")
|
||||
|
||||
parser.add_argument(
|
||||
"--checkpoint_transformer",
|
||||
type=str,
|
||||
help="transformer_tts checkpoint to synthesis")
|
||||
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="./alignments",
|
||||
help="path to save experiment results")
|
||||
|
||||
|
||||
def alignments(args):
|
||||
local_rank = dg.parallel.Env().local_rank
|
||||
place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
with open(args.config) as f:
|
||||
cfg = yaml.load(f, Loader=yaml.Loader)
|
||||
|
||||
with dg.guard(place):
|
||||
network_cfg = cfg['network']
|
||||
model = TransformerTTS(
|
||||
network_cfg['embedding_size'], network_cfg['hidden_size'],
|
||||
network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
|
||||
cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
|
||||
network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
|
||||
# Load parameters.
|
||||
global_step = io.load_parameters(
|
||||
model=model, checkpoint_path=args.checkpoint_transformer)
|
||||
model.eval()
|
||||
|
||||
# get text data
|
||||
root = Path(args.data)
|
||||
csv_path = root.joinpath("metadata.csv")
|
||||
table = pd.read_csv(
|
||||
csv_path,
|
||||
sep="|",
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
names=["fname", "raw_text", "normalized_text"])
|
||||
|
||||
pbar = tqdm(range(len(table)))
|
||||
alignments = OrderedDict()
|
||||
for i in pbar:
|
||||
fname, raw_text, normalized_text = table.iloc[i]
|
||||
# init input
|
||||
text = np.asarray(text_to_sequence(normalized_text))
|
||||
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
|
||||
pos_text = np.arange(1, text.shape[1] + 1)
|
||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
||||
|
||||
# load
|
||||
wav, _ = librosa.load(
|
||||
str(os.path.join(args.data, 'wavs', fname + ".wav")))
|
||||
|
||||
spec = librosa.stft(
|
||||
y=wav,
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
win_length=cfg['audio']['win_length'],
|
||||
hop_length=cfg['audio']['hop_length'])
|
||||
mag = np.abs(spec)
|
||||
mel = librosa.filters.mel(sr=cfg['audio']['sr'],
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
n_mels=cfg['audio']['num_mels'],
|
||||
fmin=cfg['audio']['fmin'],
|
||||
fmax=cfg['audio']['fmax'])
|
||||
mel = np.matmul(mel, mag)
|
||||
mel = np.log(np.maximum(mel, 1e-5))
|
||||
|
||||
mel_input = np.transpose(mel, axes=(1, 0))
|
||||
mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
|
||||
mel_lens = mel_input.shape[1]
|
||||
|
||||
pos_mel = np.arange(1, mel_input.shape[1] + 1)
|
||||
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
|
||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
||||
text, mel_input, pos_text, pos_mel)
|
||||
mel_input = fluid.layers.concat(
|
||||
[mel_input, postnet_pred[:, -1:, :]], axis=1)
|
||||
|
||||
alignment, _ = get_alignment(attn_probs, mel_lens,
|
||||
network_cfg['decoder_num_head'])
|
||||
alignments[fname] = alignment
|
||||
with open(args.output + '.pkl', "wb") as f:
|
||||
pickle.dump(alignments, f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Get alignments from TransformerTTS model")
|
||||
add_config_options_to_parser(parser)
|
||||
args = parser.parse_args()
|
||||
alignments(args)
|
|
@ -1,14 +0,0 @@
|
|||
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
python -u get_alignments.py \
|
||||
--use_gpu=1 \
|
||||
--output='./alignments' \
|
||||
--data='../../../dataset/LJSpeech-1.1' \
|
||||
--config='../../transformer_tts/configs/ljspeech.yaml' \
|
||||
--checkpoint_transformer='../../transformer_tts/checkpoint/transformer/step-120000' \
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in training!"
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
|
@ -1,36 +0,0 @@
|
|||
audio:
|
||||
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
|
||||
n_fft: 1024 #the number of fft components.
|
||||
sr: 22050 #the sampling rate of audio data file.
|
||||
hop_length: 256 #the number of samples to advance between frames.
|
||||
win_length: 1024 #the length (width) of the window function.
|
||||
preemphasis: 0.97
|
||||
power: 1.2 #the power to raise before griffin-lim.
|
||||
fmin: 0
|
||||
fmax: 8000
|
||||
|
||||
network:
|
||||
encoder_n_layer: 6 #the number of FFT Block in encoder.
|
||||
encoder_head: 2 #the attention head number in encoder.
|
||||
encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
|
||||
max_seq_len: 2048 #the max length of sequence.
|
||||
decoder_n_layer: 6 #the number of FFT Block in decoder.
|
||||
decoder_head: 2 #the attention head number in decoder.
|
||||
decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
|
||||
hidden_size: 384 #the hidden size in model of fastspeech.
|
||||
duration_predictor_output_size: 256 #the output size of the duration predictor.
|
||||
duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
|
||||
fft_conv1d_filter: 3 #the filter size of conv1d in fft.
|
||||
fft_conv1d_padding: 1 #the padding size of conv1d in fft.
|
||||
dropout: 0.1 #the dropout in network.
|
||||
outputs_per_step: 1
|
||||
|
||||
train:
|
||||
batch_size: 32
|
||||
learning_rate: 0.001
|
||||
warm_up_step: 4000 #the warm up step of learning rate.
|
||||
grad_clip_thresh: 0.1 #the threshold of grad clip.
|
||||
|
||||
checkpoint_interval: 1000
|
||||
max_iteration: 500000
|
||||
|
|
@ -1,186 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
import csv
|
||||
import pickle
|
||||
|
||||
from paddle import fluid
|
||||
from parakeet import g2p
|
||||
from parakeet import audio
|
||||
from parakeet.data.sampler import *
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset, SliceDataset
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
|
||||
|
||||
class LJSpeechLoader:
|
||||
def __init__(self,
|
||||
config,
|
||||
place,
|
||||
data_path,
|
||||
alignments_path,
|
||||
batch_size,
|
||||
nranks,
|
||||
rank,
|
||||
is_vocoder=False,
|
||||
shuffle=True):
|
||||
|
||||
LJSPEECH_ROOT = Path(data_path)
|
||||
metadata = LJSpeechMetaData(LJSPEECH_ROOT, alignments_path)
|
||||
transformer = LJSpeech(config)
|
||||
dataset = TransformDataset(metadata, transformer)
|
||||
dataset = CacheDataset(dataset)
|
||||
|
||||
sampler = DistributedSampler(
|
||||
len(dataset), nranks, rank, shuffle=shuffle)
|
||||
|
||||
assert batch_size % nranks == 0
|
||||
each_bs = batch_size // nranks
|
||||
dataloader = DataCargo(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=each_bs,
|
||||
shuffle=shuffle,
|
||||
batch_fn=batch_examples,
|
||||
drop_last=True)
|
||||
self.reader = fluid.io.DataLoader.from_generator(
|
||||
capacity=32,
|
||||
iterable=True,
|
||||
use_double_buffer=True,
|
||||
return_list=True)
|
||||
self.reader.set_batch_generator(dataloader, place)
|
||||
|
||||
|
||||
class LJSpeechMetaData(DatasetMixin):
|
||||
def __init__(self, root, alignments_path):
|
||||
self.root = Path(root)
|
||||
self._wav_dir = self.root.joinpath("wavs")
|
||||
csv_path = self.root.joinpath("metadata.csv")
|
||||
self._table = pd.read_csv(
|
||||
csv_path,
|
||||
sep="|",
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
names=["fname", "raw_text", "normalized_text"])
|
||||
with open(alignments_path, "rb") as f:
|
||||
self._alignments = pickle.load(f)
|
||||
|
||||
def get_example(self, i):
|
||||
fname, raw_text, normalized_text = self._table.iloc[i]
|
||||
alignment = self._alignments[fname]
|
||||
fname = str(self._wav_dir.joinpath(fname + ".wav"))
|
||||
return fname, normalized_text, alignment
|
||||
|
||||
def __len__(self):
|
||||
return len(self._table)
|
||||
|
||||
|
||||
class LJSpeech(object):
|
||||
def __init__(self, cfg):
|
||||
super(LJSpeech, self).__init__()
|
||||
self.sr = cfg['sr']
|
||||
self.n_fft = cfg['n_fft']
|
||||
self.num_mels = cfg['num_mels']
|
||||
self.win_length = cfg['win_length']
|
||||
self.hop_length = cfg['hop_length']
|
||||
self.preemphasis = cfg['preemphasis']
|
||||
self.fmin = cfg['fmin']
|
||||
self.fmax = cfg['fmax']
|
||||
|
||||
def __call__(self, metadatum):
|
||||
"""All the code for generating an Example from a metadatum. If you want a
|
||||
different preprocessing pipeline, you can override this method.
|
||||
This method may require several processor, each of which has a lot of options.
|
||||
In this case, you'd better pass a composed transform and pass it to the init
|
||||
method.
|
||||
"""
|
||||
fname, normalized_text, alignment = metadatum
|
||||
|
||||
wav, _ = librosa.load(str(fname))
|
||||
spec = librosa.stft(
|
||||
y=wav,
|
||||
n_fft=self.n_fft,
|
||||
win_length=self.win_length,
|
||||
hop_length=self.hop_length)
|
||||
mag = np.abs(spec)
|
||||
mel = librosa.filters.mel(self.sr,
|
||||
self.n_fft,
|
||||
n_mels=self.num_mels,
|
||||
fmin=self.fmin,
|
||||
fmax=self.fmax)
|
||||
mel = np.matmul(mel, mag)
|
||||
mel = np.log(np.maximum(mel, 1e-5))
|
||||
phonemes = np.array(
|
||||
g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||
return (mel, phonemes, alignment
|
||||
) # maybe we need to implement it as a map in the future
|
||||
|
||||
|
||||
def batch_examples(batch):
|
||||
texts = []
|
||||
mels = []
|
||||
text_lens = []
|
||||
pos_texts = []
|
||||
pos_mels = []
|
||||
alignments = []
|
||||
for data in batch:
|
||||
mel, text, alignment = data
|
||||
text_lens.append(len(text))
|
||||
pos_texts.append(np.arange(1, len(text) + 1))
|
||||
pos_mels.append(np.arange(1, mel.shape[1] + 1))
|
||||
mels.append(mel)
|
||||
texts.append(text)
|
||||
alignments.append(alignment)
|
||||
|
||||
# Sort by text_len in descending order
|
||||
texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
alignments = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(alignments, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
#text_lens = sorted(text_lens, reverse=True)
|
||||
|
||||
# Pad sequence with largest len of the batch
|
||||
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||
alignments = TextIDBatcher(pad_id=0)(alignments).astype(np.float32)
|
||||
mels = np.transpose(
|
||||
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||
|
||||
return (texts, mels, pos_texts, pos_mels, alignments)
|
Binary file not shown.
Before Width: | Height: | Size: 513 KiB |
|
@ -1,170 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from visualdl import LogWriter
|
||||
from scipy.io.wavfile import write
|
||||
from collections import OrderedDict
|
||||
import argparse
|
||||
from pprint import pprint
|
||||
from ruamel import yaml
|
||||
from matplotlib import cm
|
||||
import numpy as np
import librosa  # used by synthesis_with_griffinlim
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
from parakeet.g2p.en import text_to_sequence
|
||||
from parakeet import audio
|
||||
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.models.wavenet import WaveNet, UpsampleNet
|
||||
from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
|
||||
from parakeet.modules import weight_norm
|
||||
from parakeet.models.waveflow import WaveFlowModule
|
||||
from parakeet.utils.layer_tools import freeze
|
||||
from parakeet.utils import io
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument("--config", type=str, help="path of the config file")
|
||||
parser.add_argument(
|
||||
"--vocoder",
|
||||
type=str,
|
||||
default="griffin-lim",
|
||||
choices=['griffin-lim', 'waveflow'],
|
||||
help="vocoder method")
|
||||
parser.add_argument(
|
||||
"--config_vocoder", type=str, help="path of the vocoder config file")
|
||||
parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
|
||||
parser.add_argument(
|
||||
"--alpha",
|
||||
type=float,
|
||||
default=1,
|
||||
help="determine the length of the expanded sequence mel, controlling the voice speed."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--checkpoint", type=str, help="fastspeech checkpoint for synthesis")
|
||||
parser.add_argument(
|
||||
"--checkpoint_vocoder",
|
||||
type=str,
|
||||
help="vocoder checkpoint for synthesis")
|
||||
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="synthesis",
|
||||
help="path to save experiment results")
|
||||
|
||||
|
||||
def synthesis(text_input, args):
|
||||
local_rank = dg.parallel.Env().local_rank
|
||||
place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
|
||||
fluid.enable_dygraph(place)
|
||||
|
||||
with open(args.config) as f:
|
||||
cfg = yaml.load(f, Loader=yaml.Loader)
|
||||
|
||||
# tensorboard
|
||||
if not os.path.exists(args.output):
|
||||
os.mkdir(args.output)
|
||||
|
||||
writer = LogWriter(os.path.join(args.output, 'log'))
|
||||
|
||||
model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
|
||||
# Load parameters.
|
||||
global_step = io.load_parameters(
|
||||
model=model, checkpoint_path=args.checkpoint)
|
||||
model.eval()
|
||||
|
||||
text = np.asarray(text_to_sequence(text_input))
|
||||
text = np.expand_dims(text, axis=0)
|
||||
pos_text = np.arange(1, text.shape[1] + 1)
|
||||
pos_text = np.expand_dims(pos_text, axis=0)
|
||||
|
||||
text = dg.to_variable(text).astype(np.int64)
|
||||
pos_text = dg.to_variable(pos_text).astype(np.int64)
|
||||
|
||||
_, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
|
||||
|
||||
if args.vocoder == 'griffin-lim':
|
||||
#synthesis use griffin-lim
|
||||
wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
|
||||
elif args.vocoder == 'waveflow':
|
||||
wav = synthesis_with_waveflow(mel_output_postnet, args,
|
||||
args.checkpoint_vocoder, place)
|
||||
else:
|
||||
print(
|
||||
'vocoder error, we only support griffin-lim and waveflow, but received %s.'
|
||||
% args.vocoder)
|
||||
|
||||
writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
|
||||
cfg['audio']['sr'])
|
||||
if not os.path.exists(os.path.join(args.output, 'samples')):
|
||||
os.mkdir(os.path.join(args.output, 'samples'))
|
||||
write(
|
||||
os.path.join(
|
||||
os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
|
||||
cfg['audio']['sr'], wav)
|
||||
print("Synthesis completed !!!")
|
||||
writer.close()
|
||||
|
||||
|
||||
def synthesis_with_griffinlim(mel_output, cfg):
|
||||
mel_output = fluid.layers.transpose(
|
||||
fluid.layers.squeeze(mel_output, [0]), [1, 0])
|
||||
mel_output = np.exp(mel_output.numpy())
|
||||
basis = librosa.filters.mel(cfg['sr'],
|
||||
cfg['n_fft'],
|
||||
cfg['num_mels'],
|
||||
fmin=cfg['fmin'],
|
||||
fmax=cfg['fmax'])
|
||||
inv_basis = np.linalg.pinv(basis)
|
||||
spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
|
||||
|
||||
wav = librosa.core.griffinlim(
|
||||
spec**cfg['power'],
|
||||
hop_length=cfg['hop_length'],
|
||||
win_length=cfg['win_length'])
|
||||
|
||||
return wav
|
||||
|
||||
|
||||
def synthesis_with_waveflow(mel_output, args, checkpoint, place):
|
||||
|
||||
fluid.enable_dygraph(place)
|
||||
args.config = args.config_vocoder
|
||||
args.use_fp16 = False
|
||||
config = io.add_yaml_config_to_args(args)
|
||||
|
||||
mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])
|
||||
|
||||
# Build model.
|
||||
waveflow = WaveFlowModule(config)
|
||||
io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
|
||||
for layer in waveflow.sublayers():
|
||||
if isinstance(layer, weight_norm.WeightNormWrapper):
|
||||
layer.remove_weight_norm()
|
||||
|
||||
# Run model inference.
|
||||
wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
|
||||
return wav.numpy()[0]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Synthesis model")
|
||||
add_config_options_to_parser(parser)
|
||||
args = parser.parse_args()
|
||||
pprint(vars(args))
|
||||
synthesis(
|
||||
"Don't argue with the people of strong determination, because they may change the fact!",
|
||||
args)
|
|
@ -1,20 +0,0 @@
|
|||
# synthesize with a trained model
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
python -u synthesis.py \
|
||||
--use_gpu=1 \
|
||||
--alpha=1.0 \
|
||||
--checkpoint='./fastspeech_ljspeech_ckpt_1.0/fastspeech/step-162000' \
|
||||
--config='fastspeech_ljspeech_ckpt_1.0/ljspeech.yaml' \
|
||||
--output='./synthesis' \
|
||||
--vocoder='waveflow' \
|
||||
--config_vocoder='./waveflow_res128_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml' \
|
||||
--checkpoint_vocoder='./waveflow_res128_ljspeech_ckpt_1.0/step-2000000' \
|
||||
|
||||
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in synthesis!"
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
|
@ -1,166 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
from ruamel import yaml
|
||||
from tqdm import tqdm
|
||||
from matplotlib import cm
|
||||
from collections import OrderedDict
|
||||
from visualdl import LogWriter
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||
from parakeet.models.fastspeech.utils import get_alignment
|
||||
from data import LJSpeechLoader
|
||||
from parakeet.utils import io
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument("--config", type=str, help="path of the config file")
|
||||
parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
|
||||
parser.add_argument("--data", type=str, help="path of LJspeech dataset")
|
||||
parser.add_argument(
|
||||
"--alignments_path", type=str, help="path of alignments")
|
||||
|
||||
g = parser.add_mutually_exclusive_group()
|
||||
g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
|
||||
g.add_argument(
|
||||
"--iteration",
|
||||
type=int,
|
||||
help="the iteration of the checkpoint to load from output directory")
|
||||
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="experiment",
|
||||
help="path to save experiment results")
|
||||
|
||||
|
||||
def main(args):
|
||||
local_rank = dg.parallel.Env().local_rank
|
||||
nranks = dg.parallel.Env().nranks
|
||||
parallel = nranks > 1
|
||||
|
||||
with open(args.config) as f:
|
||||
cfg = yaml.load(f, Loader=yaml.Loader)
|
||||
|
||||
global_step = 0
|
||||
place = fluid.CUDAPlace(dg.parallel.Env()
|
||||
.dev_id) if args.use_gpu else fluid.CPUPlace()
|
||||
fluid.enable_dygraph(place)
|
||||
|
||||
if not os.path.exists(args.output):
|
||||
os.mkdir(args.output)
|
||||
|
||||
writer = LogWriter(os.path.join(args.output,
|
||||
'log')) if local_rank == 0 else None
|
||||
|
||||
model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
|
||||
model.train()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(
|
||||
learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
|
||||
(cfg['train']['learning_rate']**2)),
|
||||
cfg['train']['warm_up_step']),
|
||||
parameter_list=model.parameters(),
|
||||
grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
|
||||
'grad_clip_thresh']))
|
||||
reader = LJSpeechLoader(
|
||||
cfg['audio'],
|
||||
place,
|
||||
args.data,
|
||||
args.alignments_path,
|
||||
cfg['train']['batch_size'],
|
||||
nranks,
|
||||
local_rank,
|
||||
shuffle=True).reader
|
||||
iterator = iter(tqdm(reader))
|
||||
|
||||
# Load parameters.
|
||||
global_step = io.load_parameters(
|
||||
model=model,
|
||||
optimizer=optimizer,
|
||||
checkpoint_dir=os.path.join(args.output, 'checkpoints'),
|
||||
iteration=args.iteration,
|
||||
checkpoint_path=args.checkpoint)
|
||||
print("Rank {}: checkpoint loaded.".format(local_rank))
|
||||
|
||||
if parallel:
|
||||
strategy = dg.parallel.prepare_context()
|
||||
model = fluid.dygraph.parallel.DataParallel(model, strategy)
|
||||
|
||||
while global_step <= cfg['train']['max_iteration']:
|
||||
try:
|
||||
batch = next(iterator)
|
||||
except StopIteration as e:
|
||||
iterator = iter(tqdm(reader))
|
||||
batch = next(iterator)
|
||||
|
||||
(character, mel, pos_text, pos_mel, alignment) = batch
|
||||
|
||||
global_step += 1
|
||||
|
||||
#Forward
|
||||
result = model(
|
||||
character, pos_text, mel_pos=pos_mel, length_target=alignment)
|
||||
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
|
||||
mel_loss = layers.mse_loss(mel_output, mel)
|
||||
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
|
||||
duration_loss = layers.mean(
|
||||
layers.abs(
|
||||
layers.elementwise_sub(duration_predictor_output, alignment)))
|
||||
total_loss = mel_loss + mel_postnet_loss + duration_loss
|
||||
|
||||
if local_rank == 0:
|
||||
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
|
||||
writer.add_scalar('post_mel_loss',
|
||||
mel_postnet_loss.numpy(), global_step)
|
||||
writer.add_scalar('duration_loss',
|
||||
duration_loss.numpy(), global_step)
|
||||
writer.add_scalar('learning_rate',
|
||||
optimizer._learning_rate.step().numpy(),
|
||||
global_step)
|
||||
|
||||
if parallel:
|
||||
total_loss = model.scale_loss(total_loss)
|
||||
total_loss.backward()
|
||||
model.apply_collective_grads()
|
||||
else:
|
||||
total_loss.backward()
|
||||
optimizer.minimize(total_loss)
|
||||
model.clear_gradients()
|
||||
|
||||
# save checkpoint
|
||||
if local_rank == 0 and global_step % cfg['train'][
|
||||
'checkpoint_interval'] == 0:
|
||||
io.save_parameters(
|
||||
os.path.join(args.output, 'checkpoints'), global_step, model,
|
||||
optimizer)
|
||||
|
||||
if local_rank == 0:
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Train Fastspeech model")
|
||||
add_config_options_to_parser(parser)
|
||||
args = parser.parse_args()
|
||||
# Print the whole config setting.
|
||||
pprint(vars(args))
|
||||
main(args)
|
|
@ -1,15 +0,0 @@
|
|||
# train model
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python -u train.py \
|
||||
--use_gpu=1 \
|
||||
--data='../../dataset/LJSpeech-1.1' \
|
||||
--alignments_path='./alignments/alignments.pkl' \
|
||||
--output='./experiment' \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
#--checkpoint='./checkpoint/fastspeech/step-120000' \
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in training!"
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
|
@ -0,0 +1,129 @@
|
|||
# Speaker Encoder
|
||||
|
||||
This experiment trains a speaker encoder with speaker verification as its task. It is part of the transfer-learning experiment from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_aishell3). The trained speaker encoder is used to extract utterance embeddings from utterances.
|
||||
|
||||
## Model
|
||||
|
||||
The model used in this experiment is the text-independent speaker encoder from [Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/pdf/1710.10467.pdf). The GE2E softmax loss is used.
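For reference, a minimal NumPy sketch of the GE2E softmax loss from the paper (the actual model and loss here are implemented with Paddle; the scale `w` and bias `b` below are learnable in practice and the values are placeholders):

```python
import numpy as np

def ge2e_softmax_loss(embeds, w=10.0, b=-5.0):
    """embeds: (speakers_per_batch, utterances_per_speaker, embedding_size), L2-normalized."""
    n_spk, n_utt, _ = embeds.shape
    # inclusive centroids, used when comparing an utterance with *other* speakers
    centroids = embeds.mean(axis=1)
    centroids /= np.linalg.norm(centroids, axis=-1, keepdims=True)
    # exclusive centroids: leave the utterance itself out for its own speaker
    excl = (embeds.sum(axis=1, keepdims=True) - embeds) / (n_utt - 1)
    excl /= np.linalg.norm(excl, axis=-1, keepdims=True)

    # scaled cosine similarity matrix of shape (n_spk, n_utt, n_spk)
    sim = np.einsum("jid,kd->jik", embeds, centroids)
    own = np.einsum("jid,jid->ji", embeds, excl)
    idx = np.arange(n_spk)
    sim[idx, :, idx] = own
    sim = w * sim + b

    # softmax loss: each utterance should be closest to its own speaker's centroid
    log_denom = np.log(np.exp(sim).sum(axis=-1))
    return (log_denom - sim[idx, :, idx]).mean()
```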
|
||||
|
||||
## File Structure
|
||||
|
||||
```text
|
||||
ge2e
|
||||
├── README.md
|
||||
├── README_cn.md
|
||||
├── audio_processor.py
|
||||
├── config.py
|
||||
├── dataset_processors.py
|
||||
├── inference.py
|
||||
├── preprocess.py
|
||||
├── random_cycle.py
|
||||
├── speaker_verification_dataset.py
|
||||
└── train.py
|
||||
```
|
||||
|
||||
## Download Datasets
|
||||
|
||||
Currently supported datasets are Librispeech/train-other-500, VoxCeleb1, VoxCeleb2, Aidatatang-200zh and magicdata, which can be downloaded from their corresponding webpages.
|
||||
|
||||
1. Librispeech/train-other-500
|
||||
|
||||
An English multispeaker dataset, [URL](https://www.openslr.org/resources/12/train-other-500.tar.gz); only the `train-other-500` subset is used.
|
||||
|
||||
2. VoxCeleb1
|
||||
|
||||
An English multispeaker dataset, [URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html); the Audio Files from Dev A to Dev D should be downloaded, combined and extracted.
|
||||
|
||||
3. VoxCeleb2
|
||||
|
||||
An English multispeaker dataset, [URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html); the Audio Files from Dev A to Dev H should be downloaded, combined and extracted.
|
||||
|
||||
4. Aidatatang-200zh
|
||||
|
||||
A Mandarin Chinese multispeaker dataset, [URL](https://www.openslr.org/62/).
|
||||
|
||||
5. magicdata
|
||||
|
||||
A Mandarin Chinese multispeaker dataset, [URL](https://www.openslr.org/68/).
|
||||
|
||||
If you want to use other datasets, you can also download and preprocess them, as long as they meet the requirements described below.
|
||||
|
||||
## Preprocess Datasets
|
||||
|
||||
Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preprocessed datasets are organized in the file structure described below. The mel spectrogram of each utterance is saved in `.npy` format. The dataset has a two-level (speaker-utterance) structure. Since multiple datasets are combined, the dataset name is prepended to the speaker ids to avoid conflicts.
|
||||
|
||||
```text
|
||||
dataset_root
|
||||
├── dataset01_speaker01/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
├── dataset01_speaker02/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
├── dataset02_speaker01/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
└── dataset02_speaker02/
|
||||
├── utterance01.npy
|
||||
├── utterance02.npy
|
||||
└── utterance03.npy
|
||||
```
|
||||
|
||||
Run the following command to preprocess the datasets.
|
||||
|
||||
```bash
|
||||
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
|
||||
```
|
||||
|
||||
Here `--datasets_root` is the directory that contains the extracted datasets; `--output_dir` is the directory in which to save the preprocessed dataset; `--dataset_names` gives the datasets to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with commas. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
|
||||
|
||||
## Training
|
||||
|
||||
When preprocessing is done, run the command below to train the model.
|
||||
|
||||
```bash
|
||||
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
|
||||
```
|
||||
|
||||
- `--data` is the path to the preprocessed dataset.
|
||||
- `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, the config file, and a `checkpoints` directory, which contains the parameter files and optimizer state files. If `--output` already has some training results in it, the most recent parameter file and optimizer state file are loaded before training.
|
||||
- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
|
||||
- `--nprocs` is the number of replicas to run in multiprocessing-based parallel training. Currently, multiprocessing-based parallel training is only enabled when using 'gpu' as the device. `CUDA_VISIBLE_DEVICES` can be used to specify visible CUDA devices.
|
||||
|
||||
Other options are described below.
|
||||
|
||||
- `--config` is a `.yaml` config file used to override the default config (which is defined in `config.py`).
|
||||
- `--opts` accepts command-line options to further override the config. It should be the last command-line option, passed as multiple key-value pairs separated by spaces (see the example after this list).
|
||||
- `--checkpoint_path` specifies the checkpoint to load before training, without its extension. A parameter file (`.pdparams`) and an optimizer state file (`.pdopt`) with the same name are used. This option has a higher priority than auto-resuming from the `--output` directory.
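For example, a hypothetical invocation that overrides two config values (the keys come from `config.py`; the exact `--opts` parsing depends on the training script):

```bash
python train.py --data=<data_path> --output=runs/ge2e --device="gpu" --nprocs=1 \
    --opts data.n_mels 40 training.learning_rate_init 5e-5
```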
|
||||
|
||||
## Pretrained Model
|
||||
|
||||
The pretrained model is first trained for 1560k steps on Librispeech-other-500 and voxceleb1, then trained on aidatatang_200h and magic_data to 3000k steps.
|
||||
|
||||
Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
|
||||
|
||||
## Inference
|
||||
|
||||
When training is done, run the command below to generate an utterance embedding for each utterance in a dataset.
|
||||
|
||||
```bash
|
||||
python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
|
||||
```
|
||||
|
||||
`--input` is the path of the dataset used for inference.
|
||||
|
||||
`--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corresponding utterance embedding file in `*.npy` format.
|
||||
|
||||
`--checkpoint_path` is the path of the checkpoint to use, extension not included.
|
||||
|
||||
`--pattern` is the wildcard pattern used to filter audio files for inference; it defaults to `*.wav`.
|
||||
|
||||
`--device` and `--opts` have the same meaning as in the training script.
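Each output embedding can then be loaded directly with NumPy, e.g. (the path below is a placeholder following the directory layout above):

```python
import numpy as np

# one embedding file per utterance, mirroring the input directory layout
embed = np.load("<output>/dataset01_speaker01/utterance01.npy")
print(embed.shape)  # expected to be (embedding_size,), 256 by default in config.py
```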
|
||||
|
||||
## References
|
||||
|
||||
1. [Generalized End-to-end Loss for Speaker Verification](https://arxiv.org/pdf/1710.10467.pdf)
|
||||
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
|
|
@ -0,0 +1,124 @@
|
|||
# Speaker Encoder
|
||||
|
||||
This experiment trains a speaker encoder on multispeaker datasets, with Speaker Verification as its task. It is part of the transfer learning from speaker verification to multispeaker text-to-speech synthesis experiment, which can be found at [tacotron2_aishell3](../tacotron2_aishell3). The trained model is used to extract utterance embeddings from audio.
|
||||
|
||||
## Model
|
||||
|
||||
The model used in this experiment is the text-independent speaker encoder model from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). The GE2E softmax loss function is used.
|
||||
|
||||
## File Structure
|
||||
|
||||
```text
|
||||
ge2e
|
||||
├── README_cn.md
|
||||
├── audio_processor.py
|
||||
├── config.py
|
||||
├── dataset_processors.py
|
||||
├── inference.py
|
||||
├── preprocess.py
|
||||
├── random_cycle.py
|
||||
├── speaker_verification_dataset.py
|
||||
└── train.py
|
||||
```
|
||||
|
||||
## Download Datasets
|
||||
|
||||
This experiment supports the Librispeech-other-500, VoxCeleb, VoxCeleb2, ai-datatang-200zh and magicdata datasets, which can be downloaded from their corresponding pages.
|
||||
|
||||
1. Librispeech/train-other-500
|
||||
|
||||
An English multispeaker dataset, [download link](https://www.openslr.org/resources/12/train-other-500.tar.gz); only the train-other-500 subset is used in our experiment.
|
||||
|
||||
2. VoxCeleb1
|
||||
|
||||
An English multispeaker dataset, [download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html); the four Audio Files archives from Dev A to Dev D need to be downloaded, combined and extracted.
|
||||
|
||||
3. VoxCeleb2
|
||||
|
||||
An English multispeaker dataset, [download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html); the eight Audio Files archives from Dev A to Dev H need to be downloaded, combined and extracted.
|
||||
|
||||
4. Aidatatang-200zh
|
||||
|
||||
A Mandarin Chinese multispeaker dataset, [download link](https://www.openslr.org/62/).
|
||||
|
||||
5. magicdata
|
||||
|
||||
A Mandarin Chinese multispeaker dataset, [download link](https://www.openslr.org/68/).
|
||||
|
||||
If you need to use other datasets, you can also download and preprocess them yourself, as long as they meet the requirements below.
|
||||
|
||||
## Preprocess Datasets
|
||||
|
||||
The datasets used in training are multispeaker datasets; the transcriptions are not used. To enlarge the amount of data, multiple datasets can be merged into one for training. The processed files are organized as shown below; the spectrogram of each utterance is stored in `.npy` format, in a two-level speaker-utterance directory structure. Because datasets are merged, the dataset name is prepended to the speaker id to avoid conflicts between speaker ids.
|
||||
|
||||
```text
|
||||
dataset_root
|
||||
├── dataset01_speaker01/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
├── dataset01_speaker02/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
├── dataset02_speaker01/
|
||||
│ ├── utterance01.npy
|
||||
│ ├── utterance02.npy
|
||||
│ └── utterance03.npy
|
||||
└── dataset02_speaker02/
|
||||
├── utterance01.npy
|
||||
├── utterance02.npy
|
||||
└── utterance03.npy
|
||||
```
|
||||
|
||||
Run the preprocessing script:
|
||||
|
||||
```bash
|
||||
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
|
||||
```
|
||||
|
||||
Here `--datasets_root` is the path containing several raw datasets, `--output_dir` is the output path for the merged datasets, and `--dataset_names` gives the dataset names; multiple datasets can be separated by commas, e.g. 'librispeech_other, voxceleb1'. Currently supported datasets are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
|
||||
|
||||
## Training
|
||||
|
||||
After the data is processed, train with the following script.
|
||||
|
||||
```bash
|
||||
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
|
||||
```
|
||||
|
||||
- `--data` is the path of the preprocessed dataset.
|
||||
- `--output` is the path for saving training results, usually a subdirectory under `runs`. The saved results include visualdl log files, text logs, a backup of the running config, and a `checkpoints` directory containing parameter files and optimizer state files. If the specified output path contains previous training results, the most recent parameter file and optimizer state file are automatically loaded before training.
|
||||
- `--device` is the running device; currently 'cpu' and 'gpu' are supported.
|
||||
- `--nprocs` specifies the number of processes. Multi-process training is currently only supported when using 'gpu'. The `CUDA_VISIBLE_DEVICES` environment variable can be used to specify the visible devices.
|
||||
|
||||
There are a few other options.
|
||||
|
||||
- `--config` is a `.yaml` config file used to override the default config (the defaults can be found in `config.py`).
|
||||
- `--opts` further overrides the config from the command line. It is the last command-line option passed, given as multiple space-separated KEY VALUE pairs.
|
||||
- `--checkpoint_path` specifies the checkpoint to resume from, without its extension. The parameter file (`.pdparams`) and optimizer file (`.pdopt`) with the same name are loaded to resume training. This option has a higher priority than automatically resuming from the `output` directory.
|
||||
|
||||
## Pretrained Model
|
||||
|
||||
The pretrained model is trained to 1560k steps on Librispeech-other-500 and voxceleb1, and then trained on aidatatang_200h and magic_data to 3000k steps.
|
||||
|
||||
Download link: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
|
||||
|
||||
## Inference
|
||||
|
||||
Use the trained model to generate an embedding for every utterance in a dataset.
|
||||
|
||||
```bash
|
||||
python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
|
||||
```
|
||||
|
||||
- `--input` is the path of the dataset to process.
|
||||
- `--output` is the output path; it keeps the same directory structure as `--input`, and for each audio file in the input there is a `*.npy` file with the same name, containing the utterance embedding extracted from that audio file.
|
||||
- `--checkpoint_path` is the path of the parameter file used for inference, without its extension.
|
||||
- `--pattern` is the wildcard pattern used to select the audio files to process in the dataset; it defaults to `*.wav`.
|
||||
- `--device` and `--opts` have the same meaning as in the training script.
|
||||
|
||||
## References
|
||||
|
||||
1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
|
||||
2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
|
|
@ -0,0 +1,237 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
from warnings import warn
|
||||
import struct
|
||||
|
||||
from scipy.ndimage.morphology import binary_dilation
|
||||
import numpy as np
|
||||
import librosa
|
||||
|
||||
try:
|
||||
import webrtcvad
|
||||
except ModuleNotFoundError:
|
||||
warn("Unable to import 'webrtcvad'."
|
||||
"This package enables noise removal and is recommended.")
|
||||
webrtcvad = None
|
||||
|
||||
INT16_MAX = (2**15) - 1
|
||||
|
||||
|
||||
def normalize_volume(wav,
|
||||
target_dBFS,
|
||||
increase_only=False,
|
||||
decrease_only=False):
|
||||
# this function implements Loudness normalization, instead of peak
|
||||
# normalization, See https://en.wikipedia.org/wiki/Audio_normalization
|
||||
# dBFS: Decibels relative to full scale
|
||||
# See https://en.wikipedia.org/wiki/DBFS for more details
|
||||
# for 16Bit PCM audio, minimal level is -96dB
|
||||
# compute the mean dBFS and adjust to target dBFS, with by increasing
|
||||
# or decreasing
|
||||
if increase_only and decrease_only:
|
||||
raise ValueError("Both increase only and decrease only are set")
|
||||
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
|
||||
if ((dBFS_change < 0 and increase_only) or
|
||||
(dBFS_change > 0 and decrease_only)):
|
||||
return wav
|
||||
gain = 10**(dBFS_change / 20)
|
||||
return wav * gain
|
||||
|
||||
|
||||
def trim_long_silences(wav,
|
||||
vad_window_length: int,
|
||||
vad_moving_average_width: int,
|
||||
vad_max_silence_length: int,
|
||||
sampling_rate: int):
|
||||
"""
|
||||
Ensures that segments without voice in the waveform remain no longer than a
|
||||
threshold determined by the VAD parameters passed to this function.
|
||||
|
||||
:param wav: the raw waveform as a numpy array of floats
|
||||
:return: the same waveform with silences trimmed away (length <= original wav length)
|
||||
"""
|
||||
# Compute the voice detection window size
|
||||
samples_per_window = (vad_window_length * sampling_rate) // 1000
|
||||
|
||||
# Trim the end of the audio to have a multiple of the window size
|
||||
wav = wav[:len(wav) - (len(wav) % samples_per_window)]
|
||||
|
||||
# Convert the float waveform to 16-bit mono PCM
|
||||
pcm_wave = struct.pack("%dh" % len(wav),
|
||||
*(np.round(wav * INT16_MAX)).astype(np.int16))
|
||||
|
||||
# Perform voice activation detection
|
||||
voice_flags = []
|
||||
vad = webrtcvad.Vad(mode=3)
|
||||
for window_start in range(0, len(wav), samples_per_window):
|
||||
window_end = window_start + samples_per_window
|
||||
voice_flags.append(
|
||||
vad.is_speech(
|
||||
pcm_wave[window_start * 2:window_end * 2],
|
||||
sample_rate=sampling_rate))
|
||||
voice_flags = np.array(voice_flags)
|
||||
|
||||
# Smooth the voice detection with a moving average
|
||||
def moving_average(array, width):
|
||||
array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
|
||||
np.zeros(width // 2)))
|
||||
ret = np.cumsum(array_padded, dtype=float)
|
||||
ret[width:] = ret[width:] - ret[:-width]
|
||||
return ret[width - 1:] / width
|
||||
|
||||
audio_mask = moving_average(voice_flags, vad_moving_average_width)
|
||||
audio_mask = np.round(audio_mask).astype(np.bool)
|
||||
|
||||
# Dilate the voiced regions
|
||||
audio_mask = binary_dilation(audio_mask,
|
||||
np.ones(vad_max_silence_length + 1))
|
||||
audio_mask = np.repeat(audio_mask, samples_per_window)
|
||||
|
||||
return wav[audio_mask]
|
||||
|
||||
|
||||
def compute_partial_slices(n_samples: int,
|
||||
partial_utterance_n_frames: int,
|
||||
hop_length: int,
|
||||
min_pad_coverage: float=0.75,
|
||||
overlap: float=0.5):
|
||||
"""
|
||||
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
|
||||
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
|
||||
spectrogram slices are returned, so as to make each partial utterance waveform correspond to
|
||||
its spectrogram. This function assumes that the mel spectrogram parameters used are those
|
||||
passed in by the caller (e.g. the hop_length used by the preprocessor).
|
||||
|
||||
The returned ranges may be indexing further than the length of the waveform. It is
|
||||
recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
|
||||
|
||||
:param n_samples: the number of samples in the waveform
|
||||
:param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
|
||||
utterance
|
||||
:param min_pad_coverage: when reaching the last partial utterance, it may or may not have
|
||||
enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
|
||||
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
|
||||
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
|
||||
utterance, this parameter is ignored so that the function always returns at least 1 slice.
|
||||
:param overlap: by how much the partial utterance should overlap. If set to 0, the partial
|
||||
utterances are entirely disjoint.
|
||||
:return: the waveform slices and mel spectrogram slices as lists of array slices. Index
|
||||
respectively the waveform and the mel spectrogram with these slices to obtain the partial
|
||||
utterances.
|
||||
"""
|
||||
assert 0 <= overlap < 1
|
||||
assert 0 < min_pad_coverage <= 1
|
||||
|
||||
# librosa's function to compute num_frames from num_samples
|
||||
n_frames = int(np.ceil((n_samples + 1) / hop_length))
|
||||
# frame shift between adjacent partials
|
||||
frame_step = max(
|
||||
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
|
||||
|
||||
# Compute the slices
|
||||
wav_slices, mel_slices = [], []
|
||||
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
|
||||
for i in range(0, steps, frame_step):
|
||||
mel_range = np.array([i, i + partial_utterance_n_frames])
|
||||
wav_range = mel_range * hop_length
|
||||
mel_slices.append(slice(*mel_range))
|
||||
wav_slices.append(slice(*wav_range))
|
||||
|
||||
# Evaluate whether extra padding is warranted or not
|
||||
last_wav_range = wav_slices[-1]
|
||||
coverage = (n_samples - last_wav_range.start) / (
|
||||
last_wav_range.stop - last_wav_range.start)
|
||||
if coverage < min_pad_coverage and len(mel_slices) > 1:
|
||||
mel_slices = mel_slices[:-1]
|
||||
wav_slices = wav_slices[:-1]
|
||||
|
||||
return wav_slices, mel_slices
|
||||
|
||||
|
||||
class SpeakerVerificationPreprocessor(object):
|
||||
def __init__(self,
|
||||
sampling_rate: int,
|
||||
audio_norm_target_dBFS: float,
|
||||
vad_window_length,
|
||||
vad_moving_average_width,
|
||||
vad_max_silence_length,
|
||||
mel_window_length,
|
||||
mel_window_step,
|
||||
n_mels,
|
||||
partial_n_frames: int,
|
||||
min_pad_coverage: float=0.75,
|
||||
partial_overlap_ratio: float=0.5):
|
||||
self.sampling_rate = sampling_rate
|
||||
self.audio_norm_target_dBFS = audio_norm_target_dBFS
|
||||
|
||||
self.vad_window_length = vad_window_length
|
||||
self.vad_moving_average_width = vad_moving_average_width
|
||||
self.vad_max_silence_length = vad_max_silence_length
|
||||
|
||||
self.n_fft = int(mel_window_length * sampling_rate / 1000)
|
||||
self.hop_length = int(mel_window_step * sampling_rate / 1000)
|
||||
self.n_mels = n_mels
|
||||
|
||||
self.partial_n_frames = partial_n_frames
|
||||
self.min_pad_coverage = min_pad_coverage
|
||||
self.partial_overlap_ratio = partial_overlap_ratio
|
||||
|
||||
def preprocess_wav(self, fpath_or_wav, source_sr=None):
|
||||
# Load the wav from disk if needed
|
||||
if isinstance(fpath_or_wav, (str, Path)):
|
||||
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
|
||||
else:
|
||||
wav = fpath_or_wav
|
||||
|
||||
# Resample if numpy.array is passed and sr does not match
|
||||
if source_sr is not None and source_sr != self.sampling_rate:
|
||||
wav = librosa.resample(wav, source_sr, self.sampling_rate)
|
||||
|
||||
# loudness normalization
|
||||
wav = normalize_volume(
|
||||
wav, self.audio_norm_target_dBFS, increase_only=True)
|
||||
|
||||
# trim long silence
|
||||
if webrtcvad:
|
||||
wav = trim_long_silences(
|
||||
wav, self.vad_window_length, self.vad_moving_average_width,
|
||||
self.vad_max_silence_length, self.sampling_rate)
|
||||
return wav
|
||||
|
||||
def melspectrogram(self, wav):
|
||||
mel = librosa.feature.melspectrogram(
|
||||
wav,
|
||||
sr=self.sampling_rate,
|
||||
n_fft=self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
n_mels=self.n_mels)
|
||||
mel = mel.astype(np.float32).T
|
||||
return mel
|
||||
|
||||
def extract_mel_partials(self, wav):
|
||||
wav_slices, mel_slices = compute_partial_slices(
|
||||
len(wav), self.partial_n_frames, self.hop_length,
|
||||
self.min_pad_coverage, self.partial_overlap_ratio)
|
||||
|
||||
# pad audio if needed
|
||||
max_wave_length = wav_slices[-1].stop
|
||||
if max_wave_length >= len(wav):
|
||||
wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
|
||||
|
||||
# Split the utterance into partials
|
||||
frames = self.melspectrogram(wav)
|
||||
frames_batch = np.array([frames[s] for s in mel_slices])
|
||||
return frames_batch # [B, T, C]
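# Usage sketch (comments only; the values mirror the defaults in the config.py
# of this example, and the wav path is a hypothetical placeholder):
#
#     processor = SpeakerVerificationPreprocessor(
#         sampling_rate=16000, audio_norm_target_dBFS=-30,
#         vad_window_length=30, vad_moving_average_width=8,
#         vad_max_silence_length=6, mel_window_length=25,
#         mel_window_step=10, n_mels=40, partial_n_frames=160)
#     wav = processor.preprocess_wav("utterance.wav")   # hypothetical path
#     partials = processor.extract_mel_partials(wav)    # ndarray [B, T, C]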
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode
|
||||
|
||||
_C = CfgNode()
|
||||
|
||||
data_config = _C.data = CfgNode()
|
||||
|
||||
## Audio volume normalization
|
||||
data_config.audio_norm_target_dBFS = -30
|
||||
|
||||
## Audio sample rate
|
||||
data_config.sampling_rate = 16000 # Hz
|
||||
|
||||
## Voice Activation Detection
|
||||
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
|
||||
# This sets the granularity of the VAD. Should not need to be changed.
|
||||
data_config.vad_window_length = 30 # In milliseconds
|
||||
# Number of frames to average together when performing the moving average smoothing.
|
||||
# The larger this value, the larger the VAD variations must be to not get smoothed out.
|
||||
data_config.vad_moving_average_width = 8
|
||||
# Maximum number of consecutive silent frames a segment can have.
|
||||
data_config.vad_max_silence_length = 6
|
||||
|
||||
## Mel-filterbank
|
||||
data_config.mel_window_length = 25 # In milliseconds
|
||||
data_config.mel_window_step = 10 # In milliseconds
|
||||
data_config.n_mels = 40 # mel bands
|
||||
|
||||
# Number of spectrogram frames in a partial utterance
|
||||
data_config.partial_n_frames = 160 # 1600 ms
|
||||
data_config.min_pad_coverage = 0.75 # at least 75% of the audio is valid in a partial
|
||||
data_config.partial_overlap_ratio = 0.5 # overlap ratio between adjacent partials
|
||||
|
||||
model_config = _C.model = CfgNode()
|
||||
model_config.num_layers = 3
|
||||
model_config.hidden_size = 256
|
||||
model_config.embedding_size = 256 # output size
|
||||
|
||||
training_config = _C.training = CfgNode()
|
||||
training_config.learning_rate_init = 1e-4
|
||||
training_config.speakers_per_batch = 64
|
||||
training_config.utterances_per_speaker = 10
|
||||
training_config.max_iteration = 1560000
|
||||
training_config.save_interval = 10000
|
||||
training_config.valid_interval = 10000
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
return _C.clone()
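# Usage sketch (comments only; the YAML file name is a placeholder):
#
#     cfg = get_cfg_defaults()
#     cfg.merge_from_file("ge2e_override.yaml")   # optional user overrides
#     cfg.merge_from_list(["training.max_iteration", "200000"])
#     cfg.freeze()
#     print(cfg.data.partial_n_frames)            # -> 160 unless overridden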
|
|
@ -0,0 +1,183 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from functools import partial
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
import multiprocessing as mp
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from audio_processor import SpeakerVerificationPreprocessor
|
||||
|
||||
|
||||
def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):
|
||||
# Load and preprocess the waveform
|
||||
input_path, output_path = path_pair
|
||||
wav = processor.preprocess_wav(input_path)
|
||||
if len(wav) == 0:
|
||||
return
|
||||
|
||||
# Create the mel spectrogram, discard those that are too short
|
||||
frames = processor.melspectrogram(wav)
|
||||
if len(frames) < processor.partial_n_frames:
|
||||
return
|
||||
|
||||
np.save(output_path, frames)
|
||||
|
||||
|
||||
def _process_speaker(speaker_dir: Path,
|
||||
processor: SpeakerVerificationPreprocessor,
|
||||
datasets_root: Path,
|
||||
output_dir: Path,
|
||||
pattern: str,
|
||||
skip_existing: bool=False):
|
||||
    # datasets_root: a reference path used to compute speaker_name
|
||||
    # we prepend the dataset name to speaker_id because we are mixing several
|
||||
# multispeaker datasets together
|
||||
speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
|
||||
speaker_output_dir = output_dir / speaker_name
|
||||
speaker_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
    # load existing file set
|
||||
sources_fpath = speaker_output_dir / "_sources.txt"
|
||||
if sources_fpath.exists():
|
||||
try:
|
||||
with sources_fpath.open("rt") as sources_file:
|
||||
existing_names = {line.split(",")[0] for line in sources_file}
|
||||
        except Exception:
|
||||
            existing_names = set()
|
||||
else:
|
||||
        existing_names = set()
|
||||
|
||||
sources_file = sources_fpath.open("at" if skip_existing else "wt")
|
||||
for in_fpath in speaker_dir.rglob(pattern):
|
||||
out_name = "_".join(
|
||||
in_fpath.relative_to(speaker_dir).with_suffix(".npy").parts)
|
||||
if skip_existing and out_name in existing_names:
|
||||
continue
|
||||
out_fpath = speaker_output_dir / out_name
|
||||
_process_utterance((in_fpath, out_fpath), processor)
|
||||
sources_file.write(f"{out_name},{in_fpath}\n")
|
||||
|
||||
sources_file.close()
|
||||
|
||||
|
||||
def _process_dataset(processor: SpeakerVerificationPreprocessor,
|
||||
datasets_root: Path,
|
||||
speaker_dirs: List[Path],
|
||||
dataset_name: str,
|
||||
output_dir: Path,
|
||||
pattern: str,
|
||||
skip_existing: bool=False):
|
||||
print(
|
||||
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")
|
||||
|
||||
_func = partial(
|
||||
_process_speaker,
|
||||
processor=processor,
|
||||
datasets_root=datasets_root,
|
||||
output_dir=output_dir,
|
||||
pattern=pattern,
|
||||
skip_existing=skip_existing)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm(
|
||||
pool.imap(_func, speaker_dirs),
|
||||
dataset_name,
|
||||
len(speaker_dirs),
|
||||
unit="speakers"))
|
||||
print(f"Done preprocessing {dataset_name}.")
|
||||
|
||||
|
||||
def process_librispeech(processor,
|
||||
datasets_root,
|
||||
output_dir,
|
||||
skip_existing=False):
|
||||
dataset_name = "LibriSpeech/train-other-500"
|
||||
dataset_root = datasets_root / dataset_name
|
||||
speaker_dirs = list(dataset_root.glob("*"))
|
||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||
output_dir, "*.flac", skip_existing)
|
||||
|
||||
|
||||
def process_voxceleb1(processor,
|
||||
datasets_root,
|
||||
output_dir,
|
||||
skip_existing=False):
|
||||
dataset_name = "VoxCeleb1"
|
||||
dataset_root = datasets_root / dataset_name
|
||||
|
||||
    anglophone_nationalities = ["australia", "canada", "ireland", "uk", "usa"]
|
||||
with dataset_root.joinpath("vox1_meta.csv").open("rt") as metafile:
|
||||
metadata = [line.strip().split("\t") for line in metafile][1:]
|
||||
|
||||
# speaker id -> nationality
|
||||
nationalities = {
|
||||
line[0]: line[3]
|
||||
for line in metadata if line[-1] == "dev"
|
||||
}
|
||||
keep_speaker_ids = [
|
||||
speaker_id for speaker_id, nationality in nationalities.items()
|
||||
        if nationality.lower() in anglophone_nationalities
|
||||
]
|
||||
print(
|
||||
"VoxCeleb1: using samples from {} (presumed anglophone) speakers out of {}."
|
||||
.format(len(keep_speaker_ids), len(nationalities)))
|
||||
|
||||
speaker_dirs = list((dataset_root / "wav").glob("*"))
|
||||
speaker_dirs = [
|
||||
speaker_dir for speaker_dir in speaker_dirs
|
||||
if speaker_dir.name in keep_speaker_ids
|
||||
]
|
||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||
output_dir, "*.wav", skip_existing)
|
||||
|
||||
|
||||
def process_voxceleb2(processor,
|
||||
datasets_root,
|
||||
output_dir,
|
||||
skip_existing=False):
|
||||
dataset_name = "VoxCeleb2"
|
||||
dataset_root = datasets_root / dataset_name
|
||||
# There is no nationality in meta data for VoxCeleb2
|
||||
speaker_dirs = list((dataset_root / "wav").glob("*"))
|
||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||
output_dir, "*.wav", skip_existing)
|
||||
|
||||
|
||||
def process_aidatatang_200zh(processor,
|
||||
datasets_root,
|
||||
output_dir,
|
||||
skip_existing=False):
|
||||
dataset_name = "aidatatang_200zh/train"
|
||||
dataset_root = datasets_root / dataset_name
|
||||
|
||||
speaker_dirs = list((dataset_root).glob("*"))
|
||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||
output_dir, "*.wav", skip_existing)
|
||||
|
||||
|
||||
def process_magicdata(processor,
|
||||
datasets_root,
|
||||
output_dir,
|
||||
skip_existing=False):
|
||||
dataset_name = "magicdata/train"
|
||||
dataset_root = datasets_root / dataset_name
|
||||
|
||||
speaker_dirs = list((dataset_root).glob("*"))
|
||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||
output_dir, "*.wav", skip_existing)
|
|
@ -0,0 +1,140 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import tqdm
|
||||
import paddle
|
||||
import numpy as np
|
||||
|
||||
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
|
||||
|
||||
from audio_processor import SpeakerVerificationPreprocessor
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def embed_utterance(processor, model, fpath_or_wav):
|
||||
# audio processor
|
||||
wav = processor.preprocess_wav(fpath_or_wav)
|
||||
mel_partials = processor.extract_mel_partials(wav)
|
||||
|
||||
model.eval()
|
||||
# speaker encoder
|
||||
with paddle.no_grad():
|
||||
mel_partials = paddle.to_tensor(mel_partials)
|
||||
with paddle.no_grad():
|
||||
embed = model.embed_utterance(mel_partials)
|
||||
embed = embed.numpy()
|
||||
return embed
|
||||
|
||||
|
||||
def _process_utterance(ifpath: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
processor: SpeakerVerificationPreprocessor,
|
||||
model: LSTMSpeakerEncoder):
|
||||
rel_path = ifpath.relative_to(input_dir)
|
||||
ofpath = (output_dir / rel_path).with_suffix(".npy")
|
||||
ofpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
embed = embed_utterance(processor, model, ifpath)
|
||||
np.save(ofpath, embed)
|
||||
|
||||
|
||||
def main(config, args):
|
||||
paddle.set_device(args.device)
|
||||
|
||||
# load model
|
||||
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
|
||||
config.model.hidden_size,
|
||||
config.model.embedding_size)
|
||||
weights_fpath = str(Path(args.checkpoint_path).expanduser())
|
||||
model_state_dict = paddle.load(weights_fpath + ".pdparams")
|
||||
model.set_state_dict(model_state_dict)
|
||||
model.eval()
|
||||
print(f"Loaded encoder {weights_fpath}")
|
||||
|
||||
# create audio processor
|
||||
c = config.data
|
||||
processor = SpeakerVerificationPreprocessor(
|
||||
sampling_rate=c.sampling_rate,
|
||||
audio_norm_target_dBFS=c.audio_norm_target_dBFS,
|
||||
vad_window_length=c.vad_window_length,
|
||||
vad_moving_average_width=c.vad_moving_average_width,
|
||||
vad_max_silence_length=c.vad_max_silence_length,
|
||||
mel_window_length=c.mel_window_length,
|
||||
mel_window_step=c.mel_window_step,
|
||||
n_mels=c.n_mels,
|
||||
partial_n_frames=c.partial_n_frames,
|
||||
min_pad_coverage=c.min_pad_coverage,
|
||||
        partial_overlap_ratio=c.partial_overlap_ratio, )
|
||||
|
||||
# input output preparation
|
||||
input_dir = Path(args.input).expanduser()
|
||||
ifpaths = list(input_dir.rglob(args.pattern))
|
||||
print(f"{len(ifpaths)} utterances in total")
|
||||
output_dir = Path(args.output).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for ifpath in tqdm.tqdm(ifpaths, unit="utterance"):
|
||||
_process_utterance(ifpath, input_dir, output_dir, processor, model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
    parser = argparse.ArgumentParser(description="compute utterance embeddings.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
metavar="FILE",
|
||||
help="path of the config file to overwrite to default config with.")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the audio_file folder.")
|
||||
parser.add_argument(
|
||||
"--pattern",
|
||||
type=str,
|
||||
default="*.wav",
|
||||
help="pattern to filter audio files.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
metavar="OUTPUT_DIR",
|
||||
help="path to save checkpoint and logs.")
|
||||
|
||||
# load from saved checkpoint
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load")
|
||||
|
||||
# running
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
type=str,
|
||||
choices=["cpu", "gpu"],
|
||||
help="device type to use, cpu and gpu are supported.")
|
||||
|
||||
# overwrite extra config and default config
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from config import get_cfg_defaults
|
||||
from audio_processor import SpeakerVerificationPreprocessor
|
||||
from dataset_processors import (process_librispeech, process_voxceleb1,
|
||||
process_voxceleb2, process_aidatatang_200zh,
|
||||
process_magicdata)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="preprocess dataset for speaker verification task")
|
||||
parser.add_argument(
|
||||
"--datasets_root",
|
||||
type=Path,
|
||||
help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=Path, help="Path to save processed dataset.")
|
||||
parser.add_argument(
|
||||
"--dataset_names",
|
||||
type=str,
|
||||
default="librispeech_other,voxceleb1,voxceleb2",
|
||||
help="comma-separated list of names of the datasets you want to preprocess. only "
|
||||
"the train set of these datastes will be used. Possible names: librispeech_other, "
|
||||
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
|
||||
parser.add_argument(
|
||||
"--skip_existing",
|
||||
action="store_true",
|
||||
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no_trim",
|
||||
action="store_true",
|
||||
help="Preprocess audio without trimming silences (not recommended).")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.no_trim:
|
||||
try:
|
||||
import webrtcvad
|
||||
except:
|
||||
raise ModuleNotFoundError(
|
||||
"Package 'webrtcvad' not found. This package enables "
|
||||
"noise removal and is recommended. Please install and "
|
||||
"try again. If installation fails, "
|
||||
"use --no_trim to disable this error message.")
|
||||
del args.no_trim
|
||||
|
||||
args.datasets = [item.strip() for item in args.dataset_names.split(",")]
|
||||
    if args.output_dir is None:
|
||||
        args.output_dir = args.datasets_root / "SV2TTS" / "encoder"
|
||||
|
||||
args.output_dir = args.output_dir.expanduser()
|
||||
args.datasets_root = args.datasets_root.expanduser()
|
||||
assert args.datasets_root.exists()
|
||||
args.output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
config = get_cfg_defaults()
|
||||
print(args)
|
||||
|
||||
c = config.data
|
||||
processor = SpeakerVerificationPreprocessor(
|
||||
sampling_rate=c.sampling_rate,
|
||||
audio_norm_target_dBFS=c.audio_norm_target_dBFS,
|
||||
vad_window_length=c.vad_window_length,
|
||||
vad_moving_average_width=c.vad_moving_average_width,
|
||||
vad_max_silence_length=c.vad_max_silence_length,
|
||||
mel_window_length=c.mel_window_length,
|
||||
mel_window_step=c.mel_window_step,
|
||||
n_mels=c.n_mels,
|
||||
partial_n_frames=c.partial_n_frames,
|
||||
min_pad_coverage=c.min_pad_coverage,
|
||||
        partial_overlap_ratio=c.partial_overlap_ratio, )
|
||||
|
||||
preprocess_func = {
|
||||
"librispeech_other": process_librispeech,
|
||||
"voxceleb1": process_voxceleb1,
|
||||
"voxceleb2": process_voxceleb2,
|
||||
"aidatatang_200zh": process_aidatatang_200zh,
|
||||
"magicdata": process_magicdata,
|
||||
}
|
||||
|
||||
for dataset in args.datasets:
|
||||
print("Preprocessing %s" % dataset)
|
||||
preprocess_func[dataset](processor, args.datasets_root,
|
||||
args.output_dir, args.skip_existing)
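# Example invocation (a sketch; the paths below are placeholders for your
# local setup, the flags are the ones defined by the argument parser above):
#
#     python preprocess.py \
#         --datasets_root=~/datasets \
#         --output_dir=~/datasets/SV2TTS/encoder \
#         --dataset_names=librispeech_other,aidatatang_200zh \
#         --skip_existing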
|
|
@ -0,0 +1,39 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
|
||||
|
||||
def cycle(iterable):
|
||||
# cycle('ABCD') --> A B C D A B C D A B C D ...
|
||||
saved = []
|
||||
for element in iterable:
|
||||
yield element
|
||||
saved.append(element)
|
||||
while saved:
|
||||
for element in saved:
|
||||
yield element
|
||||
|
||||
|
||||
def random_cycle(iterable):
|
||||
# cycle('ABCD') --> A B C D B C D A A D B C ...
|
||||
saved = []
|
||||
for element in iterable:
|
||||
yield element
|
||||
saved.append(element)
|
||||
random.shuffle(saved)
|
||||
while saved:
|
||||
for element in saved:
|
||||
yield element
|
||||
random.shuffle(saved)
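# Usage sketch (comments only): both helpers are infinite generators and are
# consumed with next() rather than iterated to exhaustion:
#
#     it = iter(random_cycle("ABCD"))
#     picks = [next(it) for _ in range(6)]
#     # the first pass keeps the original order (A, B, C, D); later passes are shuffled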
|
|
@ -0,0 +1,131 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset, BatchSampler
|
||||
|
||||
from random_cycle import random_cycle
|
||||
|
||||
|
||||
class MultiSpeakerMelDataset(Dataset):
|
||||
"""A 2 layer directory thatn contains mel spectrograms in *.npy format.
|
||||
    An example file structure is shown below. We prefer to preprocess
|
||||
    raw datasets and organize them like this.
|
||||
|
||||
dataset_root/
|
||||
speaker1/
|
||||
utterance1.npy
|
||||
utterance2.npy
|
||||
utterance3.npy
|
||||
speaker2/
|
||||
utterance1.npy
|
||||
utterance2.npy
|
||||
utterance3.npy
|
||||
"""
|
||||
|
||||
def __init__(self, dataset_root: Path):
|
||||
self.root = Path(dataset_root).expanduser()
|
||||
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
|
||||
|
||||
speaker_utterances = {
|
||||
speaker_dir: list(speaker_dir.glob("*.npy"))
|
||||
for speaker_dir in speaker_dirs
|
||||
}
|
||||
|
||||
self.speaker_dirs = speaker_dirs
|
||||
self.speaker_to_utterances = speaker_utterances
|
||||
|
||||
# meta data
|
||||
self.num_speakers = len(self.speaker_dirs)
|
||||
        self.num_utterances = sum(
|
||||
len(utterances)
|
||||
for speaker, utterances in self.speaker_to_utterances.items())
|
||||
|
||||
def get_example_by_index(self, speaker_index, utterance_index):
|
||||
speaker_dir = self.speaker_dirs[speaker_index]
|
||||
fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
|
||||
return self[fpath]
|
||||
|
||||
def __getitem__(self, fpath):
|
||||
return np.load(fpath)
|
||||
|
||||
def __len__(self):
|
||||
return int(self.num_utterances)
|
||||
|
||||
|
||||
class MultiSpeakerSampler(BatchSampler):
|
||||
"""A multi-stratal sampler designed for speaker verification task.
|
||||
First, N speakers from all speakers are sampled randomly. Then, for each
|
||||
speaker, randomly sample M utterances from their corresponding utterances.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
dataset: MultiSpeakerMelDataset,
|
||||
speakers_per_batch: int,
|
||||
utterances_per_speaker: int):
|
||||
self._speakers = list(dataset.speaker_dirs)
|
||||
self._speaker_to_utterances = dataset.speaker_to_utterances
|
||||
|
||||
self.speakers_per_batch = speakers_per_batch
|
||||
self.utterances_per_speaker = utterances_per_speaker
|
||||
|
||||
def __iter__(self):
|
||||
# yield list of Paths
|
||||
speaker_generator = iter(random_cycle(self._speakers))
|
||||
speaker_utterances_generator = {
|
||||
s: iter(random_cycle(us))
|
||||
for s, us in self._speaker_to_utterances.items()
|
||||
}
|
||||
|
||||
while True:
|
||||
speakers = []
|
||||
for _ in range(self.speakers_per_batch):
|
||||
speakers.append(next(speaker_generator))
|
||||
|
||||
utterances = []
|
||||
for s in speakers:
|
||||
us = speaker_utterances_generator[s]
|
||||
for _ in range(self.utterances_per_speaker):
|
||||
utterances.append(next(us))
|
||||
yield utterances
|
||||
|
||||
|
||||
class RandomClip(object):
|
||||
def __init__(self, frames):
|
||||
self.frames = frames
|
||||
|
||||
def __call__(self, spec):
|
||||
# spec [T, C]
|
||||
T = spec.shape[0]
|
||||
start = random.randint(0, T - self.frames)
|
||||
return spec[start:start + self.frames, :]
|
||||
|
||||
|
||||
class Collate(object):
|
||||
def __init__(self, num_frames):
|
||||
self.random_crop = RandomClip(num_frames)
|
||||
|
||||
def __call__(self, examples):
|
||||
frame_clips = [self.random_crop(mel) for mel in examples]
|
||||
        batched_clips = np.stack(frame_clips)
|
||||
        return batched_clips
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mydataset = MultiSpeakerMelDataset(
|
||||
Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
|
||||
print(mydataset.get_example_by_index(0, 10))
|
|
@ -0,0 +1,126 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
|
||||
from paddle import distributed as dist
|
||||
from paddle.optimizer import Adam
|
||||
from paddle import DataParallel
|
||||
from paddle.io import DataLoader
|
||||
from paddle.nn.clip import ClipGradByGlobalNorm
|
||||
|
||||
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
|
||||
from parakeet.training import ExperimentBase
|
||||
from parakeet.training import default_argument_parser
|
||||
|
||||
from speaker_verification_dataset import MultiSpeakerMelDataset
|
||||
from speaker_verification_dataset import MultiSpeakerSampler
|
||||
from speaker_verification_dataset import Collate
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
class Ge2eExperiment(ExperimentBase):
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
|
||||
config.model.hidden_size,
|
||||
config.model.embedding_size)
|
||||
optimizer = Adam(
|
||||
config.training.learning_rate_init,
|
||||
parameters=model.parameters(),
|
||||
grad_clip=ClipGradByGlobalNorm(3))
|
||||
self.model = DataParallel(model) if self.parallel else model
|
||||
self.model_core = model
|
||||
self.optimizer = optimizer
|
||||
|
||||
def setup_dataloader(self):
|
||||
config = self.config
|
||||
train_dataset = MultiSpeakerMelDataset(self.args.data)
|
||||
sampler = MultiSpeakerSampler(train_dataset,
|
||||
config.training.speakers_per_batch,
|
||||
config.training.utterances_per_speaker)
|
||||
train_loader = DataLoader(
|
||||
train_dataset,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=Collate(config.data.partial_n_frames),
|
||||
num_workers=16)
|
||||
|
||||
self.train_dataset = train_dataset
|
||||
self.train_loader = train_loader
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
specs = batch
|
||||
loss, eer = self.model(specs, self.config.training.speakers_per_batch)
|
||||
loss.backward()
|
||||
self.model_core.do_gradient_ops()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
# logging
|
||||
loss_value = float(loss)
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
self.visualizer.add_scalar("train/loss", loss_value,
|
||||
self.iteration)
|
||||
self.visualizer.add_scalar("train/eer", eer, self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"param/w",
|
||||
float(self.model_core.similarity_weight), self.iteration)
|
||||
self.visualizer.add_scalar("param/b",
|
||||
float(self.model_core.similarity_bias),
|
||||
self.iteration)
|
||||
|
||||
def valid(self):
|
||||
pass
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Ge2eExperiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
exp.run()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
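# Launch sketch (an assumption: the shared default_argument_parser exposes the
# same --data/--output/--device/--nprocs options used by the other examples in
# this repo; check `python train.py --help` for the exact interface):
#
#     python train.py --data=<preprocessed SV2TTS/encoder dir> \
#         --output=<experiment dir> --device=gpu --nprocs=1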
|
|
@ -0,0 +1,92 @@
|
|||
# Tacotron2
|
||||
|
||||
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── config.py # default configuration file
|
||||
├── ljspeech.py # dataset and dataloader settings for LJSpeech
|
||||
├── preprocess.py # script to preprocess LJSpeech dataset
|
||||
├── synthesize.py # script to synthesize spectrogram from text
|
||||
├── train.py # script for tacotron2 model training
|
||||
└── synthesize.ipynb # notebook example for end-to-end TTS
|
||||
```
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
Then you need to preprocess the data by running ``preprocess.py``; the preprocessed data will be placed in the ``--output`` directory.
|
||||
|
||||
```bash
|
||||
python preprocess.py \
|
||||
--input=${DATAPATH} \
|
||||
--output=${PREPROCESSEDDATAPATH} \
|
||||
    -v
|
||||
```
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python preprocess.py --help``.
|
||||
|
||||
## Train the model
|
||||
|
||||
The Tacotron2 model can be trained by running ``train.py``.
|
||||
|
||||
```bash
|
||||
python train.py \
|
||||
--data=${PREPROCESSEDDATAPATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
    --device=gpu
|
||||
```
|
||||
|
||||
To train on CPU, set ``--device=cpu``.
|
||||
To train on multiple GPUs, set ``--nprocs`` to the number of GPUs.
|
||||
By default, training resumes from the latest checkpoint in ``--output``; to start a new training run, use a new ``${OUTPUTPATH}`` that contains no checkpoint. To resume from another existing model, set ``--checkpoint_path`` to the checkpoint you want to load (see the example below).
|
||||
|
||||
**Note: The checkpoint path cannot contain the file extension.**
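For example, a minimal sketch of resuming from a specific checkpoint while training on 4 GPUs (the checkpoint name and GPU count below are illustrative placeholders):

```bash
python train.py \
    --data=${PREPROCESSEDDATAPATH} \
    --output=${OUTPUTPATH} \
    --checkpoint_path=${OUTPUTPATH}/checkpoints/step-10000 \
    --device=gpu \
    --nprocs=4
```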
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python train.py --help``.
|
||||
|
||||
## Synthesis
|
||||
|
||||
After training Tacotron2, spectrograms can be synthesized by running ``synthesize.py``.
|
||||
|
||||
```bash
|
||||
python synthesize.py \
|
||||
--config=${CONFIGPATH} \
|
||||
--checkpoint_path=${CHECKPOINTPATH} \
|
||||
--input=${TEXTPATH} \
|
||||
--output=${OUTPUTPATH}
|
||||
--device=gpu
|
||||
```
|
||||
|
||||
The ``${CONFIGPATH}`` must match the configuration used to train ``${CHECKPOINTPATH}``.
|
||||
|
||||
For more help on arguments, run
|
||||
|
||||
``python synthesize.py --help``.
|
||||
|
||||
You can then find the spectrogram files in ``${OUTPUTPATH}``; they can be fed to a vocoder such as [waveflow](../waveflow/README.md#Synthesis) to get audio files.
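For instance, assuming the waveflow example's ``synthesize.py`` follows the same argument style as the scripts here (check its README for the exact interface), feeding the saved spectrograms to the vocoder might look like:

```bash
python ../waveflow/synthesize.py \
    --config=${WAVEFLOW_CONFIGPATH} \
    --checkpoint_path=${WAVEFLOW_CHECKPOINTPATH} \
    --input=${OUTPUTPATH} \
    --output=${WAVPATH} \
    --device=gpu
```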
|
||||
|
||||
|
||||
## Pretrained Models
|
||||
|
||||
Pretrained models can be downloaded from the links below. We provide two models with different configurations.
|
||||
|
||||
1. This model uses a binary classifier to predict the stop token: [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
|
||||
|
||||
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all content has been uttered. Guided attention loss is also used to speed up training. This model is trained with `configs/alternative.yaml`: [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
|
||||
|
||||
|
||||
## Notebook: End-to-end TTS
|
||||
|
||||
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with Tacotron2 and WaveFlow.
|
|
@ -0,0 +1,76 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=32, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
        hop_length=256, # hop size between adjacent frames
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
n_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=37, # set this according to the frontend's vocab size
|
||||
n_tones=None,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32, # number of filter in decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
d_postnet=512, # hidden size of decoder postnet
|
||||
postnet_kernel_size=5, # kernel size of conv layers in postnet
|
||||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||
        p_encoder_dropout=0.5, # dropout probability in encoder
|
||||
        p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
||||
        p_attention_dropout=0.1, # dropout probability of first rnn layer in decoder
|
||||
        p_decoder_dropout=0.1, # dropout probability of second rnn layer in decoder
|
||||
        p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
||||
d_global_condition=None,
|
||||
        use_stop_token=True, # whether to use a binary classifier to predict when to stop
|
||||
use_guided_attention_loss=False, # whether to use guided attention loss
|
||||
guided_attention_loss_sigma=0.2 # sigma in guided attention loss
|
||||
))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-3, # learning rate
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
        grad_clip_thresh=1.0, # max global norm for gradient clipping
|
||||
plot_interval=1000, # plot attention and spectrogram
|
||||
valid_interval=1000, # validation
|
||||
save_interval=1000, # checkpoint
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
|
|
@ -0,0 +1,95 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
|
||||
from parakeet.data.batch import batch_spec, batch_text_id
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
records = []
|
||||
with open(self.root / "metadata.pkl", 'rb') as f:
|
||||
metadata = pickle.load(f)
|
||||
for mel_name, text, ids in metadata:
|
||||
mel_name = self.root / "mel" / (mel_name + ".npy")
|
||||
records.append((mel_name, text, ids))
|
||||
self.records = records
|
||||
|
||||
def __getitem__(self, i):
|
||||
mel_name, _, ids = self.records[i]
|
||||
mel = np.load(mel_name)
|
||||
return ids, mel
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_idx=0, padding_value=0.,
|
||||
padding_stop_token=1.0):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
self.padding_stop_token = padding_stop_token
|
||||
|
||||
def __call__(self, examples):
|
||||
texts = []
|
||||
mels = []
|
||||
text_lens = []
|
||||
mel_lens = []
|
||||
|
||||
for data in examples:
|
||||
text, mel = data
|
||||
text = np.array(text, dtype=np.int64)
|
||||
text_lens.append(len(text))
|
||||
mels.append(mel)
|
||||
texts.append(text)
|
||||
mel_lens.append(mel.shape[1])
|
||||
|
||||
# Sort by text_len in descending order
|
||||
texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
|
||||
mel_lens = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
|
||||
mel_lens = np.array(mel_lens, dtype=np.int64)
|
||||
text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64)
|
||||
|
||||
# Pad sequence with largest len of the batch
|
||||
texts, _ = batch_text_id(texts, pad_id=self.padding_idx)
|
||||
mels, _ = batch_spec(mels, pad_value=self.padding_value)
|
||||
mels = np.transpose(mels, axes=(0, 2, 1))
|
||||
|
||||
return texts, mels, text_lens, mel_lens
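# Shapes of the returned batch (descriptive note, not part of the original file):
#   texts:     int64 ndarray [B, T_text_max], padded with padding_idx
#   mels:      float ndarray [B, T_mel_max, n_mels] (transposed above from [B, n_mels, T_mel_max])
#   text_lens: int64 ndarray [B], sorted in descending order
#   mel_lens:  int64 ndarray [B], reordered to follow the text-length sort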
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import tqdm
|
||||
import numpy as np
|
||||
|
||||
from parakeet.datasets import LJSpeechMetaData
|
||||
from parakeet.audio import AudioProcessor, LogMagnitude
|
||||
from parakeet.frontend import EnglishCharacter
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def create_dataset(config, source_path, target_path, verbose=False):
|
||||
# create output dir
|
||||
target_path = Path(target_path).expanduser()
|
||||
mel_path = target_path / "mel"
|
||||
os.makedirs(mel_path, exist_ok=True)
|
||||
|
||||
meta_data = LJSpeechMetaData(source_path)
|
||||
frontend = EnglishCharacter()
|
||||
processor = AudioProcessor(
|
||||
sample_rate=config.data.sample_rate,
|
||||
n_fft=config.data.n_fft,
|
||||
n_mels=config.data.n_mels,
|
||||
win_length=config.data.win_length,
|
||||
hop_length=config.data.hop_length,
|
||||
fmax=config.data.fmax,
|
||||
fmin=config.data.fmin)
|
||||
normalizer = LogMagnitude()
|
||||
|
||||
records = []
|
||||
for (fname, text, _) in tqdm.tqdm(meta_data):
|
||||
wav = processor.read_wav(fname)
|
||||
mel = processor.mel_spectrogram(wav)
|
||||
mel = normalizer.transform(mel)
|
||||
ids = frontend(text)
|
||||
mel_name = os.path.splitext(os.path.basename(fname))[0]
|
||||
|
||||
# save mel spectrogram
|
||||
records.append((mel_name, text, ids))
|
||||
np.save(mel_path / mel_name, mel)
|
||||
if verbose:
|
||||
print("save mel spectrograms into {}".format(mel_path))
|
||||
|
||||
# save meta data as pickle archive
|
||||
with open(target_path / "metadata.pkl", 'wb') as f:
|
||||
pickle.dump(records, f)
|
||||
if verbose:
|
||||
print("saved metadata into {}".format(target_path /
|
||||
"metadata.pkl"))
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config.data)
|
||||
|
||||
create_dataset(config, args.input, args.output, args.verbose)
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import paddle
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from parakeet.frontend import EnglishCharacter
|
||||
from parakeet.models.tacotron2 import Tacotron2
|
||||
from parakeet.utils import display
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def main(config, args):
|
||||
paddle.set_device(args.device)
|
||||
|
||||
# model
|
||||
frontend = EnglishCharacter()
|
||||
model = Tacotron2.from_pretrained(config, args.checkpoint_path)
|
||||
model.eval()
|
||||
|
||||
# inputs
|
||||
input_path = Path(args.input).expanduser()
|
||||
with open(input_path, "rt") as f:
|
||||
sentences = f.readlines()
|
||||
|
||||
if args.output is None:
|
||||
output_dir = input_path.parent / "synthesis"
|
||||
else:
|
||||
output_dir = Path(args.output).expanduser()
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
for i, sentence in enumerate(sentences):
|
||||
sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)
|
||||
|
||||
outputs = model.infer(sentence)
|
||||
mel_output = outputs["mel_outputs_postnet"][0].numpy().T
|
||||
alignment = outputs["alignments"][0].numpy().T
|
||||
|
||||
np.save(str(output_dir / f"sentence_{i}"), mel_output)
|
||||
display.plot_alignment(alignment)
|
||||
plt.savefig(str(output_dir / f"sentence_{i}.png"))
|
||||
if args.verbose:
|
||||
print("spectrogram saved at {}".format(output_dir /
|
||||
f"sentence_{i}.npy"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of the text sentences")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
|
|
@ -0,0 +1,220 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
|
||||
from parakeet.data import dataset
|
||||
from parakeet.frontend import EnglishCharacter # pylint: disable=unused-import
|
||||
from parakeet.training.cli import default_argument_parser
|
||||
from parakeet.training.experiment import ExperimentBase
|
||||
from parakeet.utils import display, mp_tools
|
||||
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
|
||||
|
||||
from config import get_cfg_defaults
|
||||
from ljspeech import LJSpeech, LJSpeechCollector
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def compute_losses(self, inputs, outputs):
|
||||
texts, mel_targets, plens, slens = inputs
|
||||
|
||||
mel_outputs = outputs["mel_output"]
|
||||
mel_outputs_postnet = outputs["mel_outputs_postnet"]
|
||||
attention_weight = outputs["alignments"]
|
||||
if self.config.model.use_stop_token:
|
||||
stop_logits = outputs["stop_logits"]
|
||||
else:
|
||||
stop_logits = None
|
||||
|
||||
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
|
||||
attention_weight, slens, plens, stop_logits)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, mels, text_lens, output_lens = batch
|
||||
outputs = self.model(texts, text_lens, mels, output_lens)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
losses_np = {k: float(v) for k, v in losses.items()}
|
||||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for k, v in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{k}", v,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, mels, text_lens, output_lens = batch
|
||||
outputs = self.model(texts, text_lens, mels, output_lens)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for k, v in losses.items():
|
||||
valid_losses[k].append(float(v))
|
||||
|
||||
attention_weights = outputs["alignments"]
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_alignments",
|
||||
display.plot_alignment(attention_weights[0].numpy().T),
|
||||
self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_target_spectrogram",
|
||||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
|
||||
.numpy().T), self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
||||
# logging
|
||||
msg = "Valid: "
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_losses.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
for k, v in valid_losses.items():
|
||||
self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = Tacotron2(
|
||||
vocab_size=config.model.vocab_size,
|
||||
d_mels=config.data.n_mels,
|
||||
d_encoder=config.model.d_encoder,
|
||||
encoder_conv_layers=config.model.encoder_conv_layers,
|
||||
encoder_kernel_size=config.model.encoder_kernel_size,
|
||||
d_prenet=config.model.d_prenet,
|
||||
d_attention_rnn=config.model.d_attention_rnn,
|
||||
d_decoder_rnn=config.model.d_decoder_rnn,
|
||||
attention_filters=config.model.attention_filters,
|
||||
attention_kernel_size=config.model.attention_kernel_size,
|
||||
d_attention=config.model.d_attention,
|
||||
d_postnet=config.model.d_postnet,
|
||||
postnet_kernel_size=config.model.postnet_kernel_size,
|
||||
postnet_conv_layers=config.model.postnet_conv_layers,
|
||||
reduction_factor=config.model.reduction_factor,
|
||||
p_encoder_dropout=config.model.p_encoder_dropout,
|
||||
p_prenet_dropout=config.model.p_prenet_dropout,
|
||||
p_attention_dropout=config.model.p_attention_dropout,
|
||||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
use_stop_token=config.model.use_stop_token)
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
grad_clip = paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.grad_clip_thresh)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
criterion = Tacotron2Loss(
|
||||
use_stop_token_loss=config.model.use_stop_token,
|
||||
use_guided_attention_loss=config.model.use_guided_attention_loss,
|
||||
sigma=config.model.guided_attention_loss_sigma)
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.criterion = criterion
|
||||
|
||||
def setup_dataloader(self):
|
||||
args = self.args
|
||||
config = self.config
|
||||
ljspeech_dataset = LJSpeech(args.data)
|
||||
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
exp.run()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
|
|
@ -0,0 +1,112 @@
|
|||
## Training a Voice Cloning Model with Tacotron2 on the AISHELL-3 Dataset

This example trains a voice cloning model on the AISHELL-3 dataset with Tacotron 2. The overall architecture largely follows the paper [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The pipeline consists of three parts:

1. Speaker Encoder: a speaker encoder trained on a speaker verification task. The dataset used for this task differs from the one used to train Tacotron 2; since no transcriptions are required, a larger amount of training data can be used. See the [ge2e](../ge2e) example for the implementation.
2. Synthesizer: the trained speaker encoder generates an utterance embedding for every sentence in AISHELL-3. This embedding is concatenated with the encoder outputs as an additional input to the Tacotron model.
3. Vocoder: WaveFlow is used as the vocoder; see the [waveflow](../waveflow) example.

## Data Processing

### Generating utterance embeddings

Use the trained speaker encoder to generate an utterance embedding for every sentence in AISHELL-3. The embeddings are stored as `.npy` files in a directory tree that mirrors the audio folder.

First `cd` into the [ge2e](../ge2e) folder, download the pretrained [model](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and run the script to generate an utterance embedding for every sentence.

```bash
python inference.py --input=<input> --output=<output> --device="gpu" --checkpoint_path=<pretrained checkpoint>
```

Here `input` is a folder that contains only audio files, e.g. `~/datasets/aishell3/train/wav`, and `output` is the folder used to store the utterance embeddings, e.g. `~/datasets/aishell3/train/embed`. The embeddings are saved as `.npy` files in the same directory structure as the audio folder.

Computing the utterance embeddings may take several hours; please be patient.

### Audio processing

The recordings in AISHELL-3 contain leading and trailing silence and have rather low amplitude, so silence removal and volume normalization are required. Simple energy- or volume-based silence removal does not work well here, because it is hard to find a single threshold that fits all sentences. Instead, we first align text and audio with a forced aligner and then trim the silence according to the alignment.

The tool we use is Montreal Forced Aligner 1.0. Since the AISHELL-3 labels include pinyin annotations, we feed pinyin transcriptions (rather than Chinese characters) to the aligner. The prosody marks (`$` and `%`) must be removed, and the transcriptions must be converted into the format Montreal Forced Aligner expects: one text file per audio file, with the same name and the `.lab` extension.

A lexicon file is also required, which maps pinyin syllables to phone sequences. Here we only split each syllable into an initial and a final, with the tone attached to the final. The [lexicon file](./lexicon.txt) we use can be downloaded.

Once everything is prepared, run training and alignment. First download [Montreal Forced Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1) and extract it; it can be run directly. `cd` into its `bin` folder and run the command below. The first three positional arguments are the audio folder, the lexicon path, and the output folder for the alignment files; `-o` specifies where to save the trained aligner model.

```bash
./mfa_train_and_align \
  ~/datasets/aishell3/train/wav \
  lexicon.txt \
  ~/datasets/aishell3/train/alignment \
  -o aishell3_model \
  -v
```

Because training and alignment take a long time, we also provide the resulting [alignment files](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), with one `.TextGrid` file per sentence.

With the alignment files available, run `process_wav.py` to process the audio.

```bash
python process_wav.py --input=<input> --output=<output> --alignment=<alignment>
```

The defaults for input, output and alignment are `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav` and `~/datasets/aishell3/train/alignment`, respectively.

After processing, the processed audio is saved in the `<output>` folder.

### Transcription processing

Convert the transcriptions into phones and tones and store them. Note that this processing differs from the one used for Montreal Forced Aligner: here the tones are split out as a separate stream. This is one possible choice; splitting only into initials and finals would also work.

Run the script to process the transcriptions.

```bash
python preprocess_transcription.py --input=<input> --output=<output>
```

The default input is `~/datasets/aishell3/train`, which should contain a `label_train-set.txt` file. The results are saved as `metadata.yaml` and `metadata.pickle`; the former is a text format convenient for inspection, the latter a binary format convenient for loading.

### Mel spectrogram extraction

Extract mel spectrograms from the processed audio and store them as `.npy` files in a directory tree that mirrors the audio folder.

```bash
python extract_mel.py --input=<input> --output=<output>
```

`input` is the folder of processed audio and `output` is the folder for the extracted spectrograms.

## Training

Run the training script.

```bash
python train.py --data=<data> --output=<output> --device="gpu"
```

Our model removes the stop token prediction of Tacotron 2. In practice, stop token prediction is an extremely imbalanced classification problem: a sentence may contain hundreds of negative frames but only a single positive frame, and the prediction is very sensitive to how silence is trimmed. Instead, decoding stops when the attention peak reaches the last symbol on the encoder side.
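
Below is a minimal sketch of such an attention-based stopping rule; the function name and the single-symbol threshold are illustrative assumptions rather than the exact implementation used in the model.

```python
import numpy as np

def should_stop(attention_weights: np.ndarray, n_last_symbols: int = 1) -> bool:
    """Stop decoding once the attention peak of the current decoder step
    reaches the last symbol(s) on the encoder side.

    attention_weights: shape (T_enc,), the alignment of the current step.
    """
    peak = int(np.argmax(attention_weights))
    return peak >= attention_weights.shape[0] - n_last_symbols
```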

In addition, to speed up convergence, we add a guided attention loss, which encourages the encoder-decoder alignment to become diagonal earlier in training.
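
For reference, here is a minimal sketch of how a guided attention weight matrix of this kind is commonly constructed (the default `sigma` matches `guided_attention_loss_sigma` in the config); the actual loss lives in `Tacotron2Loss` and may differ in details.

```python
import numpy as np

def guided_attention_weight(text_len: int, mel_len: int, sigma: float = 0.2) -> np.ndarray:
    """Soft diagonal penalty matrix W with shape (mel_len, text_len).

    W is small near the diagonal (t / mel_len close to n / text_len) and close
    to 1 far from it, so mean(W * attention) penalizes non-diagonal alignments.
    """
    n = np.arange(text_len) / text_len   # normalized encoder positions
    t = np.arange(mel_len) / mel_len     # normalized decoder positions
    diff = t[:, None] - n[None, :]       # (mel_len, text_len)
    return 1.0 - np.exp(-diff ** 2 / (2 * sigma ** 2))
```

In training, such a matrix is typically masked by the true text and spectrogram lengths before being averaged against the attention weights.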

Training logs can be inspected with visualdl.

```bash
visualdl --logdir=<output> --host=$HOSTNAME
```

Example training loss / validation loss curves are shown below.

![train](images/train.png)

![valid](images/valid.png)

<img src="images/alignment-step2000.png" alt="alignment-step2000" style="zoom:50%;" />

A blurry diagonal usually appears in the alignments produced during validation after roughly 2000 training steps, and it becomes sharper as training proceeds. However, since validation is also run with teacher forcing, it takes longer before a clear diagonal shows up in the alignments produced by truly autoregressive synthesis.

## Pretrained Model

Pretrained model download link: [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).

## Usage

This example includes a simple demo in which you can replace the reference audio and the input text and synthesize speech with the trained model. See the instructions in the [notebook](./voice_cloning.ipynb).
|
|
@ -0,0 +1,88 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
from parakeet.frontend import Vocab
|
||||
from parakeet.data import batch_text_id, batch_spec
|
||||
|
||||
from preprocess_transcription import _phones, _tones
|
||||
|
||||
voc_phones = Vocab(sorted(list(_phones)))
|
||||
print("vocab_phones:\n", voc_phones)
|
||||
voc_tones = Vocab(sorted(list(_tones)))
|
||||
print("vocab_tones:\n", voc_tones)
|
||||
|
||||
|
||||
class AiShell3(Dataset):
|
||||
"""Processed AiShell3 dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.root = Path(root).expanduser()
|
||||
self.embed_dir = self.root / "embed"
|
||||
self.mel_dir = self.root / "mel"
|
||||
|
||||
with open(self.root / "metadata.pickle", 'rb') as f:
|
||||
self.records = pickle.load(f)
|
||||
|
||||
def __getitem__(self, index):
|
||||
metadatum = self.records[index]
|
||||
sentence_id = metadatum["sentence_id"]
|
||||
speaker_id = sentence_id[:7]
|
||||
phones = metadatum["phones"]
|
||||
tones = metadatum["tones"]
|
||||
phones = np.array(
|
||||
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
||||
tones = np.array(
|
||||
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||
embed = np.load(
|
||||
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||
return phones, tones, mel, embed
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
def collate_aishell3_examples(examples):
|
||||
phones, tones, mel, embed = list(zip(*examples))
|
||||
|
||||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||
T_dec = np.max(spec_lengths)
|
||||
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
|
||||
).astype(np.float32)
|
||||
phones, _ = batch_text_id(phones)
|
||||
tones, _ = batch_text_id(tones)
|
||||
mel, _ = batch_spec(mel)
|
||||
mel = np.transpose(mel, (0, 2, 1))
|
||||
embed = np.stack(embed)
|
||||
# 7 fields
|
||||
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
||||
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dataset = AiShell3("~/datasets/aishell3/train")
|
||||
example = dataset[0]
|
||||
|
||||
examples = [dataset[i] for i in range(10)]
|
||||
batch = collate_aishell3_examples(examples)
|
||||
|
||||
for field in batch:
|
||||
print(field.shape, field.dtype)
|
|
@ -0,0 +1,39 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Tuple
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from preprocess_transcription import split_syllable
|
||||
|
||||
|
||||
def convert_to_pinyin(text: str) -> List[str]:
"""Convert text into a list of pinyin syllables. Characters that are not
Chinese, and thus cannot be converted to pinyin, are split out as-is.
|
||||
"""
|
||||
syllables = lazy_pinyin(
|
||||
text, style=Style.TONE3, neutral_tone_with_five=True)
|
||||
return syllables
|
||||
|
||||
|
||||
def convert_sentence(text: str) -> Tuple[List[str], List[str]]:
"""Convert a sentence into two lists: phones and tones."""
|
||||
syllables = convert_to_pinyin(text)
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in syllables:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
|
||||
return phones, tones
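# Usage sketch (illustrative; output traced from split_syllable's conventions,
# where initials carry the dummy tone '0'):
if __name__ == "__main__":
    phones, tones = convert_sentence("你好")
    print(phones)  # e.g. ['n', 'i', 'h', 'ao']
    print(tones)   # e.g. ['0', '3', '0', '3']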
|
|
@ -0,0 +1,82 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=32, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
hop_length=256, # hop size between adjacent frames
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
d_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=70,
|
||||
n_tones=10,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
# hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_attention_rnn=1024,
|
||||
# hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024,
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32, # number of filter in decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
d_postnet=512, # hidden size of decoder postnet
|
||||
postnet_kernel_size=5, # kernel size of conv layers in postnet
|
||||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||
p_encoder_dropout=0.5, # dropout probability in encoder
|
||||
p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
||||
|
||||
# dropout probability of the first rnn layer in decoder
|
||||
p_attention_dropout=0.1,
|
||||
# dropout probability of the second rnn layer in decoder
|
||||
p_decoder_dropout=0.1,
|
||||
p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
||||
guided_attention_loss_sigma=0.2,
|
||||
d_global_condition=256,
|
||||
|
||||
# whether to use a classifier to predict stop probability
|
||||
use_stop_token=False,
|
||||
# whether to use guided attention loss in training
|
||||
use_guided_attention_loss=True, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-3, # learning rate
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
grad_clip_thresh=1.0, # the clip norm of grad clip.
|
||||
valid_interval=1000, # validation
|
||||
save_interval=1000, # checkpoint
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
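# Usage sketch: build the default config, optionally override it from a YAML
# file or KEY VALUE pairs (standard yacs API), then freeze it before use.
if __name__ == "__main__":
    cfg = get_cfg_defaults()
    # cfg.merge_from_file("my_experiment.yaml")  # hypothetical override file
    cfg.merge_from_list(["training.lr", 5e-4])   # override a single value
    cfg.freeze()
    print(cfg.training.lr)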
|
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from parakeet.audio import AudioProcessor
|
||||
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
|
||||
|
||||
import tqdm
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def extract_mel(fname: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
p: AudioProcessor,
|
||||
n: NormalizerBase):
|
||||
relative_path = fname.relative_to(input_dir)
|
||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
wav = p.read_wav(fname)
|
||||
mel = p.mel_spectrogram(wav)
|
||||
mel = n.transform(mel)
|
||||
np.save(out_path, mel)
|
||||
|
||||
|
||||
def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
||||
input_dir = Path(input_dir).expanduser()
|
||||
fnames = list(input_dir.rglob(f"*{extension}"))
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.d_mels, config.fmin,
|
||||
config.fmax)
|
||||
n = LogMagnitude(1e-5)
|
||||
|
||||
func = partial(
|
||||
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract mel spectrograms from the processed wav files in the AiShell3 training dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="yaml config file to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
default_config = get_cfg_defaults()
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
default_config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
default_config.merge_from_list(args.opts)
|
||||
default_config.freeze()
|
||||
audio_config = default_config.data
|
||||
|
||||
extract_mel_multispeaker(audio_config, args.input, args.output)
|
Binary file not shown.
After Width: | Height: | Size: 221 KiB |
Binary file not shown.
After Width: | Height: | Size: 550 KiB |
Binary file not shown.
After Width: | Height: | Size: 514 KiB |
File diff suppressed because it is too large
|
@ -0,0 +1,258 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pickle
|
||||
|
||||
import yaml
|
||||
import tqdm
|
||||
|
||||
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
||||
|
||||
_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
|
||||
|
||||
_pauses = {'%', '$'}
|
||||
|
||||
_initials = {
|
||||
'b',
|
||||
'p',
|
||||
'm',
|
||||
'f',
|
||||
'd',
|
||||
't',
|
||||
'n',
|
||||
'l',
|
||||
'g',
|
||||
'k',
|
||||
'h',
|
||||
'j',
|
||||
'q',
|
||||
'x',
|
||||
'zh',
|
||||
'ch',
|
||||
'sh',
|
||||
'r',
|
||||
'z',
|
||||
'c',
|
||||
's',
|
||||
}
|
||||
|
||||
_finals = {
|
||||
'ii',
|
||||
'iii',
|
||||
'a',
|
||||
'o',
|
||||
'e',
|
||||
'ea',
|
||||
'ai',
|
||||
'ei',
|
||||
'ao',
|
||||
'ou',
|
||||
'an',
|
||||
'en',
|
||||
'ang',
|
||||
'eng',
|
||||
'er',
|
||||
'i',
|
||||
'ia',
|
||||
'io',
|
||||
'ie',
|
||||
'iai',
|
||||
'iao',
|
||||
'iou',
|
||||
'ian',
|
||||
'ien',
|
||||
'iang',
|
||||
'ieng',
|
||||
'u',
|
||||
'ua',
|
||||
'uo',
|
||||
'uai',
|
||||
'uei',
|
||||
'uan',
|
||||
'uen',
|
||||
'uang',
|
||||
'ueng',
|
||||
'v',
|
||||
've',
|
||||
'van',
|
||||
'ven',
|
||||
'veng',
|
||||
}
|
||||
|
||||
_ernized_symbol = {'&r'}
|
||||
|
||||
_specials = {'<pad>', '<unk>', '<s>', '</s>'}
|
||||
|
||||
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
||||
|
||||
|
||||
def is_zh(word):
|
||||
global zh_pattern
|
||||
match = zh_pattern.search(word)
|
||||
return match is not None
|
||||
|
||||
|
||||
def ernized(syllable):
|
||||
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||
|
||||
|
||||
def convert(syllable):
|
||||
# expansion of o -> uo
|
||||
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
||||
# syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
|
||||
# expansion for iong, ong
|
||||
syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
|
||||
|
||||
# expansion for ing, in
|
||||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un", "uen").replace(
|
||||
"ui", "uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
||||
|
||||
# rule for w
|
||||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv").replace("qu",
|
||||
"qv").replace("xu", "xv")
|
||||
|
||||
return syllable
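# Illustrative examples of the normalization rules above, applied to toneless
# syllable bodies (traced from the replacement rules):
#   convert("bo")    -> "buo"    (o -> uo expansion after b/p/m/f)
#   convert("xiong") -> "xveng"  (iong -> veng)
#   convert("zhi")   -> "zhiii"  (variant of i after zh/ch/sh/r)
#   convert("wu")    -> "u"      (w -> u)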
|
||||
|
||||
|
||||
def split_syllable(syllable: str):
|
||||
"""Split a syllable in pinyin into a list of phones and a list of tones.
|
||||
Initials have no tone, represented by '0', while finals have tones from
|
||||
'1,2,3,4,5'.
|
||||
|
||||
e.g.
|
||||
|
||||
zhang -> ['zh', 'ang'], ['0', '1']
|
||||
"""
|
||||
if syllable in _pauses:
|
||||
# syllable, tone
|
||||
return [syllable], ['0']
|
||||
|
||||
tone = syllable[-1]
|
||||
syllable = convert(syllable[:-1])
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
|
||||
global _initials
|
||||
if syllable[:2] in _initials:
|
||||
phones.append(syllable[:2])
|
||||
tones.append('0')
|
||||
phones.append(syllable[2:])
|
||||
tones.append(tone)
|
||||
elif syllable[0] in _initials:
|
||||
phones.append(syllable[0])
|
||||
tones.append('0')
|
||||
phones.append(syllable[1:])
|
||||
tones.append(tone)
|
||||
else:
|
||||
phones.append(syllable)
|
||||
tones.append(tone)
|
||||
return phones, tones
|
||||
|
||||
|
||||
def load_aishell3_transcription(line: str):
|
||||
sentence_id, pinyin, text = line.strip().split("|")
|
||||
syllables = pinyin.strip().split()
|
||||
|
||||
results = []
|
||||
|
||||
for syllable in syllables:
|
||||
if syllable in _pauses:
|
||||
results.append(syllable)
|
||||
elif not ernized(syllable):
|
||||
results.append(syllable)
|
||||
else:
|
||||
results.append(syllable[:-2] + syllable[-1])
|
||||
results.append('&r5')
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in results:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
for p in phones:
|
||||
assert p in _phones, p
|
||||
return {
|
||||
"sentence_id": sentence_id,
|
||||
"text": text,
|
||||
"syllables": results,
|
||||
"phones": phones,
|
||||
"tones": tones
|
||||
}
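# Note on erhua (儿化) handling above: an ernized syllable such as "huar4" is
# rewritten as the base syllable plus a separate "&r5" token before phone/tone
# splitting, i.e. "huar4" -> ["hua4", "&r5"]; pause marks ('%', '$') pass
# through unchanged.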
|
||||
|
||||
|
||||
def process_aishell3(dataset_root, output_dir):
|
||||
dataset_root = Path(dataset_root).expanduser()
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
prosody_label_path = dataset_root / "label_train-set.txt"
|
||||
with open(prosody_label_path, 'rt') as f:
|
||||
lines = [line.strip() for line in f]
|
||||
|
||||
records = lines[5:]
|
||||
|
||||
processed_records = []
|
||||
for record in tqdm.tqdm(records):
|
||||
new_record = load_aishell3_transcription(record)
|
||||
processed_records.append(new_record)
|
||||
print(new_record)
|
||||
|
||||
with open(output_dir / "metadata.pickle", 'wb') as f:
|
||||
pickle.dump(processed_records, f)
|
||||
|
||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||
yaml.safe_dump(
|
||||
processed_records, f, default_flow_style=None, allow_unicode=True)
|
||||
|
||||
print("metadata done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess the transcriptions of AiShell3 and save them in compact files (yaml and pickle)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train",
|
||||
help="path of the training dataset,(contains a label_train-set.txt).")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
help="the directory to save the processed transcription."
|
||||
"If not provided, it would be the same as the input.")
|
||||
args = parser.parse_args()
|
||||
if args.output is None:
|
||||
args.output = args.input
|
||||
|
||||
process_aishell3(args.input, args.output)
|
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from tqdm import tqdm
|
||||
from praatio import tgio
|
||||
|
||||
|
||||
def get_valid_part(fpath):
|
||||
f = tgio.openTextgrid(fpath)
|
||||
|
||||
start = 0
|
||||
phone_entry_list = f.tierDict['phones'].entryList
|
||||
first_entry = phone_entry_list[0]
|
||||
if first_entry.label == "sil":
|
||||
start = first_entry.end
|
||||
|
||||
last_entry = phone_entry_list[-1]
|
||||
if last_entry.label == "sp":
|
||||
end = last_entry.start
|
||||
else:
|
||||
end = last_entry.end
|
||||
return start, end
|
||||
|
||||
|
||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||
rel_path = fpath.relative_to(source_dir)
|
||||
opath = target_dir / rel_path
|
||||
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
|
||||
opath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
start, end = get_valid_part(apath)
|
||||
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
|
||||
normalized_wav = wav / np.max(np.abs(wav)) * 0.999
|
||||
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
|
||||
# print(f"{fpath} => {opath}")
|
||||
|
||||
|
||||
def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
||||
source_dir = Path(source_dir).expanduser()
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
alignment_dir = Path(alignment_dir).expanduser()
|
||||
|
||||
wav_paths = list(source_dir.rglob("*.wav"))
|
||||
print(f"there are {len(wav_paths)} audio files in total")
|
||||
fx = partial(
|
||||
process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
with Pool(16) as p:
|
||||
list(
|
||||
tqdm(
|
||||
p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process audio in AiShell3, trim silence according to the alignment "
|
||||
"files generated by MFA, and normalize volume by peak.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the folder to save the processed audio files.")
|
||||
parser.add_argument(
|
||||
"--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocess_aishell3(args.input, args.output, args.alignment)
|
|
@ -0,0 +1,263 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
|
||||
from parakeet.data import dataset
|
||||
from parakeet.training.cli import default_argument_parser
|
||||
from parakeet.training.experiment import ExperimentBase
|
||||
from parakeet.utils import display, mp_tools
|
||||
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
|
||||
|
||||
from config import get_cfg_defaults
|
||||
from aishell3 import AiShell3, collate_aishell3_examples
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def compute_losses(self, inputs, outputs):
|
||||
texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
|
||||
|
||||
mel_outputs = outputs["mel_output"]
|
||||
mel_outputs_postnet = outputs["mel_outputs_postnet"]
|
||||
alignments = outputs["alignments"]
|
||||
|
||||
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
|
||||
alignments, output_lens, text_lens)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
losses_np = {k: float(v) for k, v in losses.items()}
|
||||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for key, value in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{key}", value,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for key, value in losses.items():
|
||||
valid_losses[key].append(float(value))
|
||||
|
||||
attention_weights = outputs["alignments"]
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_alignments",
|
||||
display.plot_alignment(attention_weights[0].numpy().T),
|
||||
self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_target_spectrogram",
|
||||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
mel_pred = outputs['mel_outputs_postnet']
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(mel_pred[0].numpy().T),
|
||||
self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
||||
# logging
|
||||
msg = "Valid: "
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_losses.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
for key, value in valid_losses.items():
|
||||
self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def eval(self):
|
||||
"""Evaluation of Tacotron2 in autoregressive manner."""
|
||||
self.model.eval()
|
||||
mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
|
||||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i, batch in enumerate(self.test_loader):
|
||||
texts, tones, mels, utterance_embeds, *_ = batch
|
||||
outputs = self.model.infer(
|
||||
texts, tones=tones, global_condition=utterance_embeds)
|
||||
|
||||
display.plot_alignment(outputs["alignments"][0].numpy().T)
|
||||
plt.savefig(mel_dir / f"sentence_{i}.png")
|
||||
plt.close()
|
||||
np.save(mel_dir / f"sentence_{i}",
|
||||
outputs["mel_outputs_postnet"][0].numpy().T)
|
||||
print(f"sentence_{i}")
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = Tacotron2(
|
||||
vocab_size=config.model.vocab_size,
|
||||
n_tones=config.model.n_tones,
|
||||
d_mels=config.data.d_mels,
|
||||
d_encoder=config.model.d_encoder,
|
||||
encoder_conv_layers=config.model.encoder_conv_layers,
|
||||
encoder_kernel_size=config.model.encoder_kernel_size,
|
||||
d_prenet=config.model.d_prenet,
|
||||
d_attention_rnn=config.model.d_attention_rnn,
|
||||
d_decoder_rnn=config.model.d_decoder_rnn,
|
||||
attention_filters=config.model.attention_filters,
|
||||
attention_kernel_size=config.model.attention_kernel_size,
|
||||
d_attention=config.model.d_attention,
|
||||
d_postnet=config.model.d_postnet,
|
||||
postnet_kernel_size=config.model.postnet_kernel_size,
|
||||
postnet_conv_layers=config.model.postnet_conv_layers,
|
||||
reduction_factor=config.model.reduction_factor,
|
||||
p_encoder_dropout=config.model.p_encoder_dropout,
|
||||
p_prenet_dropout=config.model.p_prenet_dropout,
|
||||
p_attention_dropout=config.model.p_attention_dropout,
|
||||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
d_global_condition=config.model.d_global_condition,
|
||||
use_stop_token=config.model.use_stop_token, )
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
grad_clip = paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.grad_clip_thresh)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
criterion = Tacotron2Loss(
|
||||
use_stop_token_loss=config.model.use_stop_token,
|
||||
use_guided_attention_loss=config.model.use_guided_attention_loss,
|
||||
sigma=config.model.guided_attention_loss_sigma)
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.criterion = criterion
|
||||
|
||||
def setup_dataloader(self):
|
||||
args = self.args
|
||||
config = self.config
|
||||
aishell3_dataset = AiShell3(args.data)
|
||||
|
||||
valid_set, train_set = dataset.split(aishell3_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = collate_aishell3_examples
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
self.test_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
if not args.test:
|
||||
exp.run()
|
||||
else:
|
||||
exp.eval()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
parser.add_argument("--test", action="store_true")
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
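# Launch sketch (paths are illustrative): train the voice cloning model, or run
# autoregressive evaluation on the validation sentences with --test using the
# latest checkpoint found under --output.
#
#     python train.py --data=<processed aishell3 dir> --output=<output dir> --device="gpu"
#     python train.py --data=<processed aishell3 dir> --output=<output dir> --device="gpu" --test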
|
File diff suppressed because one or more lines are too long
|
@ -1,112 +1,52 @@
|
|||
# TransformerTTS
|
||||
|
||||
PaddlePaddle dynamic graph implementation of TransformerTTS, a neural TTS with Transformer. The implementation is based on [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
|
||||
# TransformerTTS with LJSpeech
|
||||
|
||||
## Dataset
|
||||
|
||||
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
### Download the dataset.
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
### Extract the dataset.
|
||||
|
||||
```bash
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
## Model Architecture
|
||||
### Preprocess the dataset.
|
||||
|
||||
<div align="center" name="TransformerTTS model architecture">
|
||||
<img src="./images/model_architecture.jpg" width=400 height=600 /> <br>
|
||||
</div>
|
||||
<div align="center" >
|
||||
TransformerTTS model architecture
|
||||
</div>
|
||||
|
||||
The model adopts the multi-head attention mechanism to replace the RNN structures and also the original attention mechanism in [Tacotron2](https://arxiv.org/abs/1712.05884). The model consists of two main parts, encoder and decoder. We also implement the CBHG model of Tacotron as the vocoder part and convert the spectrogram into raw wave using Griffin-Lim algorithm.
|
||||
|
||||
## Project Structure
|
||||
|
||||
```text
|
||||
├── config # yaml configuration files
|
||||
├── data.py # dataset and dataloader settings for LJSpeech
|
||||
├── synthesis.py # script to synthesize waveform from text
|
||||
├── train_transformer.py # script for transformer model training
|
||||
├── train_vocoder.py # script for vocoder model training
|
||||
```
|
||||
|
||||
## Saving & Loading
|
||||
|
||||
`train_transformer.py` and `train_vocoder.py` have 3 arguments in common, `--checkpoint`, `--iteration` and `--output`.
|
||||
|
||||
1. `--output` is the directory for saving results.
|
||||
During training, checkpoints are saved in `${output}/checkpoints` and tensorboard logs are saved in `${output}/log`.
|
||||
During synthesis, results are saved in `${output}/samples` and tensorboard log is save in `${output}/log`.
|
||||
|
||||
2. `--checkpoint` is the path of a checkpoint and `--iteration` is the target step. They are used to load checkpoints in the following way.
|
||||
|
||||
- If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
|
||||
|
||||
- If `--checkpoint` is not provided, we try to load the checkpoint of the target step specified by `--iteration` from the `${output}/checkpoints/` directory, e.g. given `--iteration 120000`, the checkpoint `${output}/checkpoints/step-120000.*` will be loaded.
|
||||
|
||||
- If both `--checkpoint` and `--iteration` are not provided, we try to load the latest checkpoint from `${output}/checkpoints/` directory.
|
||||
|
||||
## Train Transformer
|
||||
|
||||
TransformerTTS model can be trained by running ``train_transformer.py``.
|
||||
Assume the path to save the preprocessed dataset is `ljspeech_transformer_tts`. Run the command below to preprocess the dataset.
|
||||
|
||||
```bash
|
||||
python train_transformer.py \
|
||||
--use_gpu=1 \
|
||||
--data=${DATAPATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_transformer_tts
|
||||
```
|
||||
|
||||
Or you can run the script file directly.
|
||||
## Train the model
|
||||
|
||||
The training script requires 4 command line arguments.
|
||||
`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend using a subdirectory of `runs` to manage different experiments).
|
||||
|
||||
`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
|
||||
|
||||
```bash
|
||||
sh train_transformer.sh
|
||||
python train.py --data=ljspeech_transformer_tts/ --output=runs/test --device="gpu" --nprocs=1
|
||||
```
|
||||
|
||||
If you want to train on multiple GPUs, you must start training in the following way.
|
||||
If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
|
||||
|
||||
## Synthesize
|
||||
|
||||
Synthesize mel spectrograms from text. We assume `--input` is a text file with one sentence per line, and `--output` is a directory to save the synthesized mel spectrograms (log magnitude) in `.npy` format. The mel spectrograms can be used with `Waveflow` to generate waveforms.
|
||||
|
||||
`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here.
|
||||
|
||||
`--device` specifies the device to run synthesis on.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train_transformer.py \
|
||||
--use_gpu=1 \
|
||||
--data=${DATAPATH} \
|
||||
--output=${OUTPUTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
python synthesize.py --input=sentence.txt --output=mels/ --checkpoint_path='step-310000' --device="gpu" --verbose
|
||||
```
|
||||
|
||||
If you wish to resume from an existing model, See [Saving-&-Loading](#Saving-&-Loading) for details of checkpoint loading.
|
||||
## Pretrained Model
|
||||
|
||||
**Note: In order to ensure the training effect, we recommend using multi-GPU training to enlarge the batch size, and at least 16 samples in single batch per GPU.**
|
||||
|
||||
For more help on arguments
|
||||
|
||||
``python train_transformer.py --help``.
|
||||
|
||||
## Synthesis
|
||||
|
||||
After training the TransformerTTS, audio can be synthesized by running ``synthesis.py``.
|
||||
|
||||
```bash
|
||||
python synthesis.py \
|
||||
--use_gpu=0 \
|
||||
--output=${OUTPUTPATH} \
|
||||
--config='configs/ljspeech.yaml' \
|
||||
--checkpoint_transformer=${CHECKPOINTPATH} \
|
||||
--vocoder='griffin-lim' \
|
||||
```
|
||||
|
||||
We currently support two vocoders, Griffin-Lim algorithm and WaveFlow. You can set ``--vocoder`` to use one of them. If you want to use WaveFlow as your vocoder, you need to set ``--config_vocoder`` and ``--checkpoint_vocoder`` which are the path of the config and checkpoint of vocoder. You can download the pre-trained model of WaveFlow from [here](https://github.com/PaddlePaddle/Parakeet#vocoders).
|
||||
|
||||
Or you can run the script file directly.
|
||||
|
||||
```bash
|
||||
sh synthesis.sh
|
||||
```
|
||||
For more help on arguments
|
||||
|
||||
``python synthesis.py --help``.
|
||||
|
||||
Then you can find the synthesized audio files in ``${OUTPUTPATH}/samples``.
|
||||
Pretrained model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip).
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=16, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
hop_length=256, # hop size between adjacent frames
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
n_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
mel_start_value=0.5, # value for starting frame
|
||||
mel_end_value=-0.5, # value for ending frame
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
d_decoder=256, # decoder's internal size
|
||||
n_heads=4, # actually it can differ at each layer
|
||||
d_ffn=1024, # encoder_d_ffn & decoder_d_ffn
|
||||
encoder_layers=4, # number of transformer encoder layer
|
||||
decoder_layers=4, # number of transformer decoder layer
|
||||
d_prenet=256, # decoder prenet's hidden size (n_mels=>d_prenet=>d_decoder)
|
||||
d_postnet=256, # decoder postnet(cnn)'s internal channel
|
||||
postnet_layers=5, # decoder postnet(cnn)'s layer
|
||||
postnet_kernel_size=5, # decoder postnet(cnn)'s kernel size
|
||||
max_reduction_factor=10, # max_reduction factor
|
||||
dropout=0.1, # global dropout probability
|
||||
stop_loss_scale=8.0, # scale factor for the stop loss
|
||||
decoder_prenet_dropout=0.5, # decoder prenet dropout probability
|
||||
))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-4, # learning rate
|
||||
drop_n_heads=[[0, 0], [15000, 1]],
|
||||
reduction_factor=[[0, 10], [80000, 4], [200000, 2]],
|
||||
plot_interval=1000, # plot attention and spectrogram
|
||||
valid_interval=1000, # validation
|
||||
save_interval=10000, # checkpoint
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
|
|
@ -1,38 +0,0 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 1024
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 256
|
||||
win_length: 1024
|
||||
power: 1.2
|
||||
fmin: 0
|
||||
fmax: 8000
|
||||
|
||||
network:
|
||||
hidden_size: 256
|
||||
embedding_size: 512
|
||||
encoder_num_head: 4
|
||||
encoder_n_layers: 3
|
||||
decoder_num_head: 4
|
||||
decoder_n_layers: 3
|
||||
outputs_per_step: 1
|
||||
stop_loss_weight: 8
|
||||
|
||||
vocoder:
|
||||
hidden_size: 256
|
||||
|
||||
train:
|
||||
batch_size: 32
|
||||
learning_rate: 0.001
|
||||
warm_up_step: 4000
|
||||
grad_clip_thresh: 1.0
|
||||
|
||||
checkpoint_interval: 1000
|
||||
image_interval: 2000
|
||||
|
||||
max_iteration: 500000
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,219 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
import csv
|
||||
|
||||
from paddle import fluid
|
||||
from parakeet import g2p
|
||||
from parakeet.data.sampler import *
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset, SliceDataset
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
|
||||
|
||||
class LJSpeechLoader:
|
||||
def __init__(self,
|
||||
config,
|
||||
place,
|
||||
data_path,
|
||||
batch_size,
|
||||
nranks,
|
||||
rank,
|
||||
is_vocoder=False,
|
||||
shuffle=True):
|
||||
|
||||
LJSPEECH_ROOT = Path(data_path)
|
||||
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
||||
transformer = LJSpeech(config)
|
||||
dataset = TransformDataset(metadata, transformer)
|
||||
dataset = CacheDataset(dataset)
|
||||
|
||||
sampler = DistributedSampler(
|
||||
len(dataset), nranks, rank, shuffle=shuffle)
|
||||
|
||||
assert batch_size % nranks == 0
|
||||
each_bs = batch_size // nranks
|
||||
if is_vocoder:
|
||||
dataloader = DataCargo(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=each_bs,
|
||||
shuffle=shuffle,
|
||||
batch_fn=batch_examples_vocoder,
|
||||
drop_last=True)
|
||||
else:
|
||||
dataloader = DataCargo(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=each_bs,
|
||||
shuffle=shuffle,
|
||||
batch_fn=batch_examples,
|
||||
drop_last=True)
|
||||
self.reader = fluid.io.DataLoader.from_generator(
|
||||
capacity=32,
|
||||
iterable=True,
|
||||
use_double_buffer=True,
|
||||
return_list=True)
|
||||
self.reader.set_batch_generator(dataloader, place)
|
||||
|
||||
|
||||
class LJSpeechMetaData(DatasetMixin):
|
||||
def __init__(self, root):
|
||||
self.root = Path(root)
|
||||
self._wav_dir = self.root.joinpath("wavs")
|
||||
csv_path = self.root.joinpath("metadata.csv")
|
||||
self._table = pd.read_csv(
|
||||
csv_path,
|
||||
sep="|",
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
names=["fname", "raw_text", "normalized_text"])
|
||||
|
||||
def get_example(self, i):
|
||||
fname, raw_text, normalized_text = self._table.iloc[i]
|
||||
fname = str(self._wav_dir.joinpath(fname + ".wav"))
|
||||
return fname, raw_text, normalized_text
|
||||
|
||||
def __len__(self):
|
||||
return len(self._table)
|
||||
|
||||
|
||||
class LJSpeech(object):
|
||||
def __init__(self, config):
|
||||
super(LJSpeech, self).__init__()
|
||||
self.config = config
|
||||
self.sr = config['sr']
|
||||
self.n_mels = config['num_mels']
|
||||
self.preemphasis = config['preemphasis']
|
||||
self.n_fft = config['n_fft']
|
||||
self.win_length = config['win_length']
|
||||
self.hop_length = config['hop_length']
|
||||
self.fmin = config['fmin']
|
||||
self.fmax = config['fmax']
|
||||
|
||||
def __call__(self, metadatum):
|
||||
"""All the code for generating an Example from a metadatum. If you want a
|
||||
different preprocessing pipeline, you can override this method.
|
||||
This method may require several processor, each of which has a lot of options.
|
||||
In this case, you'd better pass a composed transform and pass it to the init
|
||||
method.
|
||||
"""
|
||||
fname, raw_text, normalized_text = metadatum
|
||||
|
||||
# load
|
||||
wav, _ = librosa.load(str(fname))
|
||||
|
||||
spec = librosa.stft(
|
||||
y=wav,
|
||||
n_fft=self.n_fft,
|
||||
win_length=self.win_length,
|
||||
hop_length=self.hop_length)
|
||||
mag = np.abs(spec)
|
||||
mel = librosa.filters.mel(sr=self.sr,
|
||||
n_fft=self.n_fft,
|
||||
n_mels=self.n_mels,
|
||||
fmin=self.fmin,
|
||||
fmax=self.fmax)
|
||||
mel = np.matmul(mel, mag)
|
||||
mel = np.log(np.maximum(mel, 1e-5))
|
||||
|
||||
characters = np.array(
|
||||
g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||
return (mag, mel, characters)
|
||||
|
||||
|
||||
def batch_examples(batch):
|
||||
texts = []
|
||||
mels = []
|
||||
mel_inputs = []
|
||||
text_lens = []
|
||||
pos_texts = []
|
||||
pos_mels = []
|
||||
stop_tokens = []
|
||||
for data in batch:
|
||||
_, mel, text = data
|
||||
mel_inputs.append(
|
||||
np.concatenate(
|
||||
[np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
|
||||
axis=-1))
|
||||
text_lens.append(len(text))
|
||||
pos_texts.append(np.arange(1, len(text) + 1))
|
||||
pos_mels.append(np.arange(1, mel.shape[1] + 1))
|
||||
mels.append(mel)
|
||||
texts.append(text)
|
||||
stop_token = np.append(np.zeros([mel.shape[1] - 1], np.float32), 1.0)
|
||||
stop_tokens.append(stop_token)
|
||||
|
||||
# Sort by text_len in descending order
|
||||
texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mel_inputs = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
stop_tokens = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(stop_tokens, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
text_lens = sorted(text_lens, reverse=True)
|
||||
|
||||
# Pad sequence with largest len of the batch
|
||||
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||
stop_tokens = TextIDBatcher(pad_id=1, dtype=np.float32)(pos_mels)
|
||||
mels = np.transpose(
|
||||
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||
mel_inputs = np.transpose(
|
||||
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||
|
||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, stop_tokens)
|
||||
|
||||
|
||||
def batch_examples_vocoder(batch):
|
||||
mels = []
|
||||
mags = []
|
||||
for data in batch:
|
||||
mag, mel, _ = data
|
||||
mels.append(mel)
|
||||
mags.append(mag)
|
||||
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
|
||||
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
|
||||
|
||||
return (mels, mags)
|
Binary file not shown.
Before Width: | Height: | Size: 322 KiB |
|
@ -0,0 +1,81 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
|
||||
from parakeet.data.batch import batch_spec, batch_text_id
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
records = []
|
||||
with open(self.root / "metadata.pkl", 'rb') as f:
|
||||
metadata = pickle.load(f)
|
||||
for mel_name, text, phonemes, ids in metadata:
|
||||
mel_name = self.root / "mel" / (mel_name + ".npy")
|
||||
records.append((mel_name, text, phonemes, ids))
|
||||
self.records = records
|
||||
|
||||
def __getitem__(self, i):
|
||||
mel_name, _, _, ids = self.records[i]
|
||||
mel = np.load(mel_name)
|
||||
return ids, mel
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
# decorate mel & create stop probability
|
||||
class Transform(object):
|
||||
def __init__(self, start_value, end_value):
|
||||
self.start_value = start_value
|
||||
self.end_value = end_value
|
||||
|
||||
def __call__(self, example):
|
||||
ids, mel = example # ids already have <s> and </s>
|
||||
ids = np.array(ids, dtype=np.int64)
|
||||
# add start and end frame
|
||||
mel = np.pad(mel, [(0, 0), (1, 1)],
|
||||
mode='constant',
|
||||
constant_values=[(0, 0),
|
||||
(self.start_value, self.end_value)])
|
||||
stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
|
||||
stop_labels[-1] = 2
|
||||
# actually this thing can also be done within the model
|
||||
return ids, mel, stop_labels
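# Usage sketch: decorate a toy example with start/end frames and stop labels
# (values mirror the defaults in config.py: mel_start_value=0.5, mel_end_value=-0.5).
#
#     transform = Transform(start_value=0.5, end_value=-0.5)
#     ids, mel, stop_labels = transform(([2, 13, 27, 3], np.zeros((80, 100), "float32")))
#     mel.shape          # (80, 102): one start and one end frame added
#     stop_labels[-3:]   # array([1, 1, 2])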
|
||||
|
||||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_idx=0, padding_value=0.):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
|
||||
def __call__(self, examples):
|
||||
ids = [example[0] for example in examples]
|
||||
mels = [example[1] for example in examples]
|
||||
stop_probs = [example[2] for example in examples]
|
||||
|
||||
ids, _ = batch_text_id(ids, pad_id=self.padding_idx)
|
||||
mels, _ = batch_spec(mels, pad_value=self.padding_value)
|
||||
stop_probs, _ = batch_text_id(stop_probs, pad_id=self.padding_idx)
|
||||
return ids, np.transpose(mels, [0, 2, 1]), stop_probs
|
|
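A minimal sketch of how these three pieces are wired together with paddle.io.DataLoader (the training script later in this diff does the same); the dataset path and the start/end mel values below are placeholders.

from paddle.io import DataLoader

from parakeet.data import dataset
from ljspeech import LJSpeech, LJSpeechCollector, Transform

# placeholder path and start/end values; the real ones come from the config
ljspeech = dataset.TransformDataset(
    LJSpeech("~/datasets/ljspeech_processed"),
    Transform(start_value=0.5, end_value=-0.5))
loader = DataLoader(
    ljspeech,
    batch_size=4,
    shuffle=True,
    collate_fn=LJSpeechCollector(padding_idx=0))

for ids, mels, stop_probs in loader:
    # ids: (B, T_text), mels: (B, T_mel, n_mels), stop_probs: (B, T_mel)
    break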
@ -0,0 +1,110 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pickle
import argparse
from pathlib import Path

import tqdm
import numpy as np

from parakeet.datasets import LJSpeechMetaData
from parakeet.audio import AudioProcessor, LogMagnitude
from parakeet.frontend import English

from config import get_cfg_defaults


def create_dataset(config, source_path, target_path, verbose=False):
    # create output dir
    target_path = Path(target_path).expanduser()
    mel_path = target_path / "mel"
    os.makedirs(mel_path, exist_ok=True)

    meta_data = LJSpeechMetaData(source_path)
    frontend = English()
    processor = AudioProcessor(
        sample_rate=config.data.sample_rate,
        n_fft=config.data.n_fft,
        n_mels=config.data.n_mels,
        win_length=config.data.win_length,
        hop_length=config.data.hop_length,
        fmax=config.data.fmax,
        fmin=config.data.fmin)
    normalizer = LogMagnitude()

    records = []
    for (fname, text, _) in tqdm.tqdm(meta_data):
        wav = processor.read_wav(fname)
        mel = processor.mel_spectrogram(wav)
        mel = normalizer.transform(mel)
        phonemes = frontend.phoneticize(text)
        ids = frontend.numericalize(phonemes)
        mel_name = os.path.splitext(os.path.basename(fname))[0]

        # save mel spectrogram
        records.append((mel_name, text, phonemes, ids))
        np.save(mel_path / mel_name, mel)
    if verbose:
        print("save mel spectrograms into {}".format(mel_path))

    # save meta data as pickle archive
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path / "metadata.pkl"))

    # also save meta data into text format for inspection
    with open(target_path / "metadata.txt", 'wt') as f:
        for mel_name, text, phonemes, _ in records:
            phoneme_str = "|".join(phonemes)
            f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
    if verbose:
        print("saved metadata into {}".format(target_path / "metadata.txt"))

    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config.data)

    create_dataset(config, args.input, args.output, args.verbose)
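A hedged sketch for sanity-checking the artifacts this script writes, relying only on the record layout built above, i.e. (mel_name, text, phonemes, ids) tuples in metadata.pkl plus mel/<mel_name>.npy files; the dataset path is a placeholder.

import pickle
from pathlib import Path

import numpy as np

root = Path("~/datasets/ljspeech_processed").expanduser()  # placeholder path
with open(root / "metadata.pkl", "rb") as f:
    records = pickle.load(f)

mel_name, text, phonemes, ids = records[0]
mel = np.load(root / "mel" / (mel_name + ".npy"))
print(mel_name, mel.shape)      # (n_mels, T) log-magnitude mel spectrogram
print(len(phonemes), len(ids))  # phoneme sequence and its integer ids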
@ -1,202 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from scipy.io.wavfile import write
import numpy as np
import librosa  # used by synthesis_with_griffinlim below
from tqdm import tqdm
from matplotlib import cm
from visualdl import LogWriter
from ruamel import yaml
from pathlib import Path
import argparse
from pprint import pprint
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parakeet.g2p.en import text_to_sequence
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.waveflow import WaveFlowModule
from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.utils import io


def add_config_options_to_parser(parser):
    parser.add_argument("--config", type=str, help="path of the config file")
    parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
    parser.add_argument(
        "--stop_threshold",
        type=float,
        default=0.5,
        help="The threshold of the stop token; generation stops once the predicted stop probability exceeds it."
    )
    parser.add_argument(
        "--max_len",
        type=int,
        default=1000,
        help="The max length of the synthesized spectrum. If the synthesized spectrum is longer than max_len, it is cut off."
    )

    parser.add_argument(
        "--checkpoint_transformer",
        type=str,
        help="transformer_tts checkpoint for synthesis")
    parser.add_argument(
        "--vocoder",
        type=str,
        default="griffin-lim",
        choices=['griffin-lim', 'waveflow'],
        help="vocoder method")
    parser.add_argument(
        "--config_vocoder", type=str, help="path of the vocoder config file")
    parser.add_argument(
        "--checkpoint_vocoder",
        type=str,
        help="vocoder checkpoint for synthesis")

    parser.add_argument(
        "--output",
        type=str,
        default="synthesis",
        help="path to save experiment results")


def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # visualdl log writer
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # init input
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(
            dg.to_variable(pos_text).astype(np.int64), [0])

        for i in range(args.max_len):
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(
                dg.to_variable(pos_mel).astype(np.int64), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            if stop_preds.numpy()[0, -1] > args.stop_threshold:
                break
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_%d_0' % global_step,
                    x,
                    i * 4 + j)

        if args.vocoder == 'griffin-lim':
            # synthesize with griffin-lim
            wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
        elif args.vocoder == 'waveflow':
            # synthesize with waveflow
            wav = synthesis_with_waveflow(postnet_pred, args,
                                          args.checkpoint_vocoder, place)
        else:
            print(
                'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                % args.vocoder)

        writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                         cfg['audio']['sr'])
        if not os.path.exists(os.path.join(args.output, 'samples')):
            os.mkdir(os.path.join(args.output, 'samples'))
        write(
            os.path.join(
                os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
            cfg['audio']['sr'], wav)
        print("Synthesis completed!")
        writer.close()


def synthesis_with_griffinlim(mel_output, cfg):
    # synthesis with griffin-lim
    mel_output = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output, [0]), [1, 0])
    mel_output = np.exp(mel_output.numpy())
    basis = librosa.filters.mel(cfg['sr'],
                                cfg['n_fft'],
                                cfg['num_mels'],
                                fmin=cfg['fmin'],
                                fmax=cfg['fmax'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))

    wav = librosa.core.griffinlim(
        spec**cfg['power'],
        hop_length=cfg['hop_length'],
        win_length=cfg['win_length'])

    return wav


def synthesis_with_waveflow(mel_output, args, checkpoint, place):
    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    mel_spectrogram = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output, [0]), [1, 0])
    mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
    for layer in waveflow.sublayers():
        if isinstance(layer, WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Synthesis model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(vars(args))
    synthesis(
        "Life was like a box of chocolates, you never know what you're gonna get.",
        args)
@ -1,17 +0,0 @@

# synthesize with pretrained TransformerTTS and WaveFlow checkpoints
CUDA_VISIBLE_DEVICES=0 \
python -u synthesis.py \
--use_gpu=0 \
--output='./synthesis' \
--config='transformer_tts_ljspeech_ckpt_1.0/ljspeech.yaml' \
--checkpoint_transformer='./transformer_tts_ljspeech_ckpt_1.0/step-120000' \
--vocoder='waveflow' \
--config_vocoder='./waveflow_res128_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml' \
--checkpoint_vocoder='./waveflow_res128_ljspeech_ckpt_1.0/step-2000000'

if [ $? -ne 0 ]; then
    echo "Failed in synthesis!"
    exit 1
fi
exit 0
@ -0,0 +1,103 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from pathlib import Path

import numpy as np
import paddle
from matplotlib import pyplot as plt

from parakeet.frontend import English
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.utils import display

from config import get_cfg_defaults


def main(config, args):
    paddle.set_device(args.device)

    # model
    frontend = English()
    model = TransformerTTS.from_pretrained(frontend, config,
                                           args.checkpoint_path)
    model.eval()

    # inputs
    input_path = Path(args.input).expanduser()
    with open(input_path, "rt") as f:
        sentences = f.readlines()

    output_dir = Path(args.output).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    for i, sentence in enumerate(sentences):
        if args.verbose:
            print("text: ", sentence)
            print("phones: ", frontend.phoneticize(sentence))
        text_ids = paddle.to_tensor(frontend(sentence))
        text_ids = paddle.unsqueeze(text_ids, 0)  # (1, T)

        with paddle.no_grad():
            outputs = model.infer(text_ids, verbose=args.verbose)

        mel_output = outputs["mel_output"][0].numpy()
        cross_attention_weights = outputs["cross_attention_weights"]
        attns = np.stack([attn[0].numpy() for attn in cross_attention_weights])
        attns = np.transpose(attns, [0, 1, 3, 2])
        display.plot_multilayer_multihead_alignments(attns)
        plt.savefig(str(output_dir / f"sentence_{i}.png"))

        mel_output = mel_output.T  #(C, T)
        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /
                                                   f"sentence_{i}.npy"))


if __name__ == "__main__":
    config = get_cfg_defaults()

    parser = argparse.ArgumentParser(
        description="generate mel spectrogram with TransformerTTS.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument("--input", type=str, help="path of the text sentences")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--device", type=str, default="cpu", help="device type to use.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
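The saved spectrograms are plain .npy arrays, so they can be inspected or handed to a separately trained vocoder without this script; a minimal sketch, assuming the directory passed to --output above.

from pathlib import Path

import numpy as np

output_dir = Path("output")                  # whatever was passed to --output
mel = np.load(output_dir / "sentence_0.npy")
print(mel.shape)  # (C, T): one row per mel channel, one column per frame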
@ -0,0 +1,221 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from collections import defaultdict

import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler

from parakeet.data import dataset
from parakeet.frontend import English
from parakeet.models.transformer_tts import TransformerTTS, TransformerTTSLoss
from parakeet.utils import scheduler, mp_tools, display
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase

from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechCollector, Transform


class TransformerTTSExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        frontend = English()
        model = TransformerTTS(
            frontend,
            d_encoder=config.model.d_encoder,
            d_decoder=config.model.d_decoder,
            d_mel=config.data.n_mels,
            n_heads=config.model.n_heads,
            d_ffn=config.model.d_ffn,
            encoder_layers=config.model.encoder_layers,
            decoder_layers=config.model.decoder_layers,
            d_prenet=config.model.d_prenet,
            d_postnet=config.model.d_postnet,
            postnet_layers=config.model.postnet_layers,
            postnet_kernel_size=config.model.postnet_kernel_size,
            max_reduction_factor=config.model.max_reduction_factor,
            decoder_prenet_dropout=config.model.decoder_prenet_dropout,
            dropout=config.model.dropout)
        if self.parallel:
            model = paddle.DataParallel(model)
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            beta1=0.9,
            beta2=0.98,
            epsilon=1e-9,
            parameters=model.parameters())
        criterion = TransformerTTSLoss(config.model.stop_loss_scale)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)

        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.drop_n_heads = drop_n_heads
        self.reduction_factor = reduction_factor

    def setup_dataloader(self):
        args = self.args
        config = self.config

        ljspeech_dataset = LJSpeech(args.data)
        transform = Transform(config.data.mel_start_value,
                              config.data.mel_end_value)
        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
                                                    transform)
        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)
        batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

        if not self.parallel:
            train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                num_replicas=dist.get_world_size(),
                rank=dist.get_rank(),
                shuffle=True,
                drop_last=True)
            train_loader = DataLoader(
                train_set, batch_sampler=sampler, collate_fn=batch_fn)

        valid_loader = DataLoader(
            valid_set, batch_size=config.data.batch_size, collate_fn=batch_fn)

        self.train_loader = train_loader
        self.valid_loader = valid_loader

    def compute_outputs(self, text, mel):
        model_core = self.model._layers if self.parallel else self.model
        model_core.set_constants(
            self.reduction_factor(self.iteration),
            self.drop_n_heads(self.iteration))

        mel_input = mel[:, :-1, :]
        reduced_mel_input = mel_input[:, ::model_core.r, :]
        outputs = self.model(text, reduced_mel_input)
        return outputs

    def compute_losses(self, inputs, outputs):
        _, mel, stop_label = inputs
        mel_target = mel[:, 1:, :]
        stop_label_target = stop_label[:, 1:]

        mel_output = outputs["mel_output"]
        mel_intermediate = outputs["mel_intermediate"]
        stop_logits = outputs["stop_logits"]

        time_steps = mel_target.shape[1]
        losses = self.criterion(
            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
        return losses

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        text, mel, stop_label = batch
        outputs = self.compute_outputs(text, mel)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        self.model.eval()
        valid_losses = defaultdict(list)
        for i, batch in enumerate(self.valid_loader):
            text, mel, stop_label = batch
            outputs = self.compute_outputs(text, mel)
            losses = self.compute_losses(batch, outputs)
            for k, v in losses.items():
                valid_losses[k].append(float(v))

            if i < 2:
                attention_weights = outputs["cross_attention_weights"]
                attention_weights = [
                    np.transpose(item[0].numpy(), [0, 2, 1])
                    for item in attention_weights
                ]
                attention_weights = np.stack(attention_weights)
                self.visualizer.add_figure(
                    f"valid_sentence_{i}_cross_attention_weights",
                    display.plot_multilayer_multihead_alignments(
                        attention_weights), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
        for k, v in valid_losses.items():
            self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)


def main_sp(config, args):
    exp = TransformerTTSExperiment(config, args)
    exp.setup()
    exp.resume_or_load()
    exp.run()


def main(config, args):
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
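compute_outputs above decimates the teacher-forced decoder input in time by the current reduction factor (mel_input[:, ::r, :]). A standalone numpy sketch of just that slicing, with r=4 as an arbitrary example, to make the shapes explicit.

import numpy as np

B, T, n_mels = 2, 17, 80
mel = np.random.rand(B, T, n_mels)           # a batched mel target

r = 4                                        # example reduction factor
mel_input = mel[:, :-1, :]                   # drop last frame (teacher forcing)
reduced_mel_input = mel_input[:, ::r, :]     # keep every r-th frame
print(mel_input.shape, reduced_mel_input.shape)  # (2, 16, 80) (2, 4, 80)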
@ -1,219 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tqdm import tqdm
from visualdl import LogWriter
from collections import OrderedDict
import argparse
from pprint import pprint
from ruamel import yaml
from matplotlib import cm
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.utils import io


def add_config_options_to_parser(parser):
    parser.add_argument("--config", type=str, help="path of the config file")
    parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
    parser.add_argument("--data", type=str, help="path of LJspeech dataset")

    g = parser.add_mutually_exclusive_group()
    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
    g.add_argument(
        "--iteration",
        type=int,
        help="the iteration of the checkpoint to load from output directory")

    parser.add_argument(
        "--output",
        type=str,
        default="experiment",
        help="path to save experiment results")


def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(),
                                  global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(vars(args))
    main(args)
Some files were not shown because too many files have changed in this diff.