Merge branch 'master' of upstream

commit f312b2f05c

README.md

@@ -74,31 +74,72 @@ Entries to the introduction, and the launch of training and synthsis for differe
 ## Pre-trained models and audio samples
 
-Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on and the total training steps, and several synthesized audio samples based on the pre-trained model.
+Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the URL of the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model.
 
-- Vocoders
+#### Vocoders
 
 We provide the model checkpoints of WaveFlow with 64 and 128 residual channels, ClariNet and WaveNet.
 
 <div align="center">
 <table>
     <thead>
         <tr>
             <th style="width: 250px">
-            WaveFlow
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 64)</a>
             </th>
             <th style="width: 250px">
-            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_ckpt_1.0.zip">ClariNet</a>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip">WaveFlow (res. channels 128)</a>
             </th>
         </tr>
     </thead>
     <tbody>
         <tr>
-            <th>LJSpeech, 2M</th>
-            <th>LJSpeech, 500K</th>
+            <th>LJSpeech </th>
+            <th>LJSpeech </th>
        </tr>
        <tr>
            <th>
-            To be added soon
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_0.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_1.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_2.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_3.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res64_ljspeech_samples_1.0/step_3020k_sentence_4.wav">
+            <img src="images/audio_icon.png" width=250 /></a>
            </th>
            <th>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav">
+            <img src="images/audio_icon.png" width=250 /></a>
            </th>
        </tr>
    </tbody>
+    <thead>
+        <tr>
+            <th style="width: 250px">
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_ckpt_1.0.zip">ClariNet</a>
+            </th>
+            <th style="width: 250px">
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_ckpt_1.0.zip">WaveNet</a>
+            </th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <th>LJSpeech </th>
+            <th>LJSpeech </th>
+        </tr>
+        <tr>
+            <th>
            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_0.wav">
            <img src="images/audio_icon.png" width=250 /></a><br>
@@ -111,15 +152,57 @@ Parakeet also releases some well-trained parameters for the example models, whic
            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/clarinet_ljspeech_samples_1.0/step_500000_sentence_4.wav">
            <img src="images/audio_icon.png" width=250 /></a>
            </th>
+            <th>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_0.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_1.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_2.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_3.wav">
+            <img src="images/audio_icon.png" width=250 /></a><br>
+            <a href="https://paddlespeech.bj.bcebos.com/Parakeet/wavenet_ljspeech_samples_1.0/step_2450k_sentence_4.wav">
+            <img src="images/audio_icon.png" width=250 /></a>
+            </th>
        </tr>
    </tbody>
 </table>
 </div>
 
 **Note:** The input mel spectrograms are from the validation dataset, which are not seen during training.
 
-- TTS models
+#### TTS models
 
 <div align="center">
 <table>
     <thead>
         <tr>
             <th style="width: 250px">
             Deep Voice 3
             </th>
             <th style="width: 250px">
             Transformer TTS
             </th>
         </tr>
     </thead>
     <tbody>
         <tr>
             <th>LJSpeech </th>
             <th>LJSpeech </th>
         </tr>
         <tr>
             <th style="height: 150px">
             To be added soon
             </th>
             <th >
             To be added soon
             </th>
         </tr>
     </tbody>
     <thead>
 </table>
 </div>
 
+Click each link to download; you will get a compressed package which contains the pre-trained model and the `yaml` config describing how to train the model.
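Each link points to a `*.zip` package, so it can be unpacked with any archive tool or with Python's standard library. A quick sketch (the file name below is one of the checkpoint packages linked in the tables above):

```python
import zipfile

# Unpack a downloaded checkpoint package. It contains the pre-trained
# parameters and the `yaml` config that was used to train the model.
with zipfile.ZipFile("waveflow_res128_ljspeech_ckpt_1.0.zip") as z:
    z.extractall("waveflow_res128_ljspeech_ckpt_1.0")
```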
@@ -22,47 +22,71 @@ tar xjvf LJSpeech-1.1.tar.bz2
 └── utils.py             utility functions
 ```
 
+## Saving & Loading
+
+`train.py` and `synthesis.py` have 3 arguments in common: `--checkpoint`, `--iteration` and `output`.
+
+1. `output` is the directory for saving results.
+During training, checkpoints are saved in `checkpoints/` in `output` and the tensorboard log is saved in `log/` in `output`. Other possible outputs are saved in `states/` in `output`.
+During synthesizing, audio files and other possible outputs are saved in `synthesis/` in `output`.
+So after training and synthesizing with the same output directory, the file structure of the output directory looks like this.
+
+```text
+├── checkpoints/      # checkpoint directory (including *.pdparams, *.pdopt and a text file `checkpoint` that records the latest checkpoint)
+├── states/           # audio files generated at validation and other possible outputs
+├── log/              # tensorboard log
+└── synthesis/        # synthesized audio files and other possible outputs
+```
+
+2. `--checkpoint` and `--iteration` are used for loading from an existing checkpoint. Loading an existing checkpoint follows these rules (see the sketch after this section):
+If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
+If `--checkpoint` is not provided, we try to load the checkpoint specified by `--iteration` from the checkpoint directory. If `--iteration` is not provided either, we try to load the latest checkpoint from the checkpoint directory.
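The loading rule above is plain precedence logic. A minimal sketch of it in Python (a hypothetical helper, not the actual `parakeet.utils.io` implementation; it assumes checkpoints are named `step-<iteration>` and that the text file `checkpoint` in the checkpoint directory records the latest step, as described above):

```python
import os


def resolve_checkpoint(output_dir, checkpoint=None, iteration=None):
    """Illustrative sketch of the --checkpoint / --iteration precedence."""
    if checkpoint is not None:
        # rule 1: an explicit checkpoint path wins
        return checkpoint
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    if iteration is None:
        # rule 3: fall back to the latest checkpoint recorded in the
        # index file, e.g. a line like "model_checkpoint_path: step-500000"
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            iteration = int(f.readline().split()[-1].split("-")[-1])
    # rule 2: otherwise load the checkpoint of the given iteration
    return os.path.join(checkpoint_dir, "step-{}".format(iteration))
```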
 ## Train
 
 Train the model using `train.py`; follow the usage displayed by `python train.py --help`.
 
 ```text
-usage: train.py [-h] [--config CONFIG] [--device DEVICE] [--output OUTPUT]
-                [--data DATA] [--resume RESUME] [--wavenet WAVENET]
+usage: train.py [-h] [--config CONFIG] [--device DEVICE] [--data DATA]
+                [--checkpoint CHECKPOINT | --iteration ITERATION]
+                [--wavenet WAVENET]
+                output
 
-train a ClariNet model with LJspeech and a trained WaveNet model.
+Train a ClariNet model with LJspeech and a trained WaveNet model.
 
+positional arguments:
+  output                path to save experiment results
+
 optional arguments:
-  -h, --help         show this help message and exit
-  --config CONFIG    path of the config file.
-  --device DEVICE    device to use.
-  --output OUTPUT    path to save student.
-  --data DATA        path of LJspeech dataset.
-  --resume RESUME    checkpoint to load from.
-  --wavenet WAVENET  wavenet checkpoint to use.
+  -h, --help            show this help message and exit
+  --config CONFIG       path of the config file
+  --device DEVICE       device to use
+  --data DATA           path of LJspeech dataset
+  --checkpoint CHECKPOINT
+                        checkpoint to resume from
+  --iteration ITERATION
+                        the iteration of the checkpoint to load from output
+                        directory
+  --wavenet WAVENET     wavenet checkpoint to use
 ```
 
 - `--config` is the configuration file to use. The provided configurations can be used directly. You can also change some values in the configuration file and train the model with a different config.
-- `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt).
-- `--resume` is the path of the checkpoint. If it is provided, the model would load the checkpoint before trainig.
-- `--output` is the directory to save results, all result are saved in this directory. The structure of the output directory is shown below.
-
-```text
-├── checkpoints      # checkpoint
-├── states           # audio files generated at validation
-└── log              # tensorboard log
-```
-
 - `--device` is the device (gpu id) to use for training. `-1` means CPU.
-- `--wavenet` is the path of the wavenet checkpoint to load. If you do not specify `--resume`, then this must be provided.
-Before you start training a ClariNet model, you should have trained a WaveNet model with single Gaussian output distribution. Make sure the config of the teacher model matches that of the trained model.
+- `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains `metadata.txt`).
+- `--checkpoint` is the path of the checkpoint.
+- `--iteration` is the iteration of the checkpoint to load from the output directory.
+- `output` is the directory to save results; all results are saved in this directory.
+
+See [Saving-&-Loading](#saving--loading) for details of checkpoint loading.
+
+- `--wavenet` is the path of the wavenet checkpoint to load.
+When you start training a ClariNet model without loading from a ClariNet checkpoint, you should have trained a WaveNet model with a single Gaussian output distribution. Make sure the config of the teacher model matches that of the trained wavenet model.
 
 Example script:
 
 ```bash
-python train.py --config=./configs/clarinet_ljspeech.yaml --data=./LJSpeech-1.1/ --output=experiment --device=0 --conditioner=wavenet_checkpoint/conditioner --conditioner=wavenet_checkpoint/teacher
+python train.py \
+    --config=./configs/clarinet_ljspeech.yaml \
+    --data=./LJSpeech-1.1/ \
+    --device=0 \
+    --wavenet="wavenet-step-2000000" \
+    experiment
 ```
 
 You can monitor the training log via tensorboard, using the script below.
 
@@ -75,29 +99,50 @@ tensorboard --logdir=.
 ## Synthesis
 ```text
 usage: synthesis.py [-h] [--config CONFIG] [--device DEVICE] [--data DATA]
-                    checkpoint output
+                    [--checkpoint CHECKPOINT | --iteration ITERATION]
+                    output
 
-train a ClariNet model with LJspeech and a trained WaveNet model.
+Synthesize audio files from mel spectrogram in the validation set.
 
 positional arguments:
-  checkpoint       checkpoint to load from.
-  output           path to save student.
+  output                path to save the synthesized audio
 
 optional arguments:
-  -h, --help       show this help message and exit
-  --config CONFIG  path of the config file.
-  --device DEVICE  device to use.
-  --data DATA      path of LJspeech dataset.
+  -h, --help            show this help message and exit
+  --config CONFIG       path of the config file
+  --device DEVICE       device to use
+  --data DATA           path of LJspeech dataset
+  --checkpoint CHECKPOINT
+                        checkpoint to resume from
+  --iteration ITERATION
+                        the iteration of the checkpoint to load from output
+                        directory
 ```
 
 - `--config` is the configuration file to use. You should use the same configuration with which you trained your model.
-- `--data` is the path of the LJspeech dataset. A dataset is not needed for synthesis, but since the input is mel spectrogram, we need to get mel spectrogram from audio files.
-- `checkpoint` is the checkpoint to load.
-- `output_path` is the directory to save results. The output path contains the generated audio files (`*.wav`).
 - `--device` is the device (gpu id) to use. `-1` means CPU.
+- `--data` is the path of the LJspeech dataset. In principle, a dataset is not needed for synthesis, but since the input is mel spectrogram, we need to extract mel spectrograms from audio files.
+- `--checkpoint` is the checkpoint to load.
+- `--iteration` is the iteration of the checkpoint to load from the output directory.
+- `output` is the directory to save synthesized audio. Audio files are saved in `synthesis/` in the `output` directory.
+
+See [Saving-&-Loading](#saving--loading) for details of checkpoint loading.
 
 Example script:
 
 ```bash
-python synthesis.py --config=./configs/wavenet_single_gaussian.yaml --data=./LJSpeech-1.1/ --device=0 experiment/checkpoints/step_500000 generated
+python synthesis.py \
+    --config=./configs/wavenet_single_gaussian.yaml \
+    --data=./LJSpeech-1.1/ \
+    --device=0 \
+    --iteration=500000 \
+    experiment
 ```
 
+or
+
+```bash
+python synthesis.py \
+    --config=./configs/wavenet_single_gaussian.yaml \
+    --data=./LJSpeech-1.1/ \
+    --device=0 \
+    --checkpoint="experiment/checkpoints/step-500000" \
+    experiment
+```
@@ -26,29 +26,41 @@ from tensorboardX import SummaryWriter
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 
+from parakeet.modules.weight_norm import WeightNormWrapper
 from parakeet.models.wavenet import WaveNet, UpsampleNet
 from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
 from parakeet.data import TransformDataset, SliceDataset, RandomSampler, SequentialSampler, DataCargo
 from parakeet.utils.layer_tools import summary, freeze
+from parakeet.utils import io
 
-from utils import valid_model, eval_model, save_checkpoint, load_checkpoint, load_model
+from utils import eval_model
 sys.path.append("../wavenet")
 from data import LJSpeechMetaData, Transform, DataCollector
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="synthesize audio files from mel spectrogram in the validation set."
+        description="Synthesize audio files from mel spectrogram in the validation set."
     )
-    parser.add_argument("--config", type=str, help="path of the config file.")
+    parser.add_argument("--config", type=str, help="path of the config file")
     parser.add_argument(
         "--device", type=int, default=-1, help="device to use.")
-    parser.add_argument("--data", type=str, help="path of LJspeech dataset.")
+    parser.add_argument("--data", type=str, help="path of LJspeech dataset")
+
+    g = parser.add_mutually_exclusive_group()
+    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
+    g.add_argument(
+        "--iteration",
+        type=int,
+        help="the iteration of the checkpoint to load from output directory")
 
     parser.add_argument(
-        "checkpoint", type=str, help="checkpoint to load from.")
-    parser.add_argument(
-        "output", type=str, default="experiment", help="path to save student.")
+        "output",
+        type=str,
+        default="experiment",
+        help="path to save the synthesized audio")
 
     args = parser.parse_args()
 
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
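A side note on the `add_mutually_exclusive_group` pattern used throughout these scripts: argparse itself rejects a command line that supplies both options, which is what makes `--checkpoint` and `--iteration` safe to treat as alternatives. A minimal self-contained demonstration:

```python
import argparse

parser = argparse.ArgumentParser()
g = parser.add_mutually_exclusive_group()
g.add_argument("--checkpoint", type=str)
g.add_argument("--iteration", type=int)

args = parser.parse_args(["--iteration", "500000"])  # accepted
print(args.checkpoint, args.iteration)  # None 500000

# parser.parse_args(["--checkpoint", "x", "--iteration", "1"])
# -> error: argument --iteration: not allowed with argument --checkpoint
```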
@@ -136,17 +148,32 @@ if __name__ == "__main__":
     model = Clarinet(upsample_net, teacher, student, stft,
                      student_log_scale_min, lmd)
     summary(model)
-    load_model(model, args.checkpoint)
 
-    # loader
-    train_loader = fluid.io.DataLoader.from_generator(
-        capacity=10, return_list=True)
-    train_loader.set_batch_generator(train_cargo, place)
+    # load parameters
+    if args.checkpoint is not None:
+        # load from args.checkpoint
+        iteration = io.load_parameters(
+            model, checkpoint_path=args.checkpoint)
+    else:
+        # load from "args.output/checkpoints"
+        checkpoint_dir = os.path.join(args.output, "checkpoints")
+        iteration = io.load_parameters(
+            model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
+    assert iteration > 0, "A trained checkpoint is needed."
+
+    # make generation fast
+    for sublayer in model.sublayers():
+        if isinstance(sublayer, WeightNormWrapper):
+            sublayer.remove_weight_norm()
 
     # data loader
     valid_loader = fluid.io.DataLoader.from_generator(
         capacity=10, return_list=True)
     valid_loader.set_batch_generator(valid_cargo, place)
 
-    if not os.path.exists(args.output):
-        os.makedirs(args.output)
-    eval_model(model, valid_loader, args.output, sample_rate)
+    # the directory to save audio files
+    synthesis_dir = os.path.join(args.output, "synthesis")
+    if not os.path.exists(synthesis_dir):
+        os.makedirs(synthesis_dir)
+
+    eval_model(model, valid_loader, synthesis_dir, iteration, sample_rate)
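On the "make generation fast" step above: a weight-normalized layer stores a direction `v` and a gain `g` and recomputes its weight on every forward pass; once training is done, the weight can be computed once and frozen, which is the point of `remove_weight_norm()`. A small numeric illustration (not Parakeet code):

```python
import numpy as np

g = 2.0
v = np.array([3.0, 4.0])

# what a weight-normalized layer recomputes on every forward pass
w = g * v / np.linalg.norm(v)
print(w)  # [1.2 1.6] -- after removal, this fixed w is stored directly
```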
@@ -30,31 +30,46 @@ from parakeet.models.wavenet import WaveNet, UpsampleNet
 from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
 from parakeet.data import TransformDataset, SliceDataset, RandomSampler, SequentialSampler, DataCargo
 from parakeet.utils.layer_tools import summary, freeze
+from parakeet.utils import io
 
-from utils import make_output_tree, valid_model, save_checkpoint, load_checkpoint, load_wavenet
+from utils import make_output_tree, eval_model, load_wavenet
 
 # import dataset from wavenet
 sys.path.append("../wavenet")
 from data import LJSpeechMetaData, Transform, DataCollector
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="train a clarinet model with LJspeech and a trained wavenet model."
+        description="Train a ClariNet model with LJspeech and a trained WaveNet model."
     )
-    parser.add_argument("--config", type=str, help="path of the config file.")
-    parser.add_argument(
-        "--device", type=int, default=-1, help="device to use.")
-    parser.add_argument("--data", type=str, help="path of LJspeech dataset.")
-    parser.add_argument("--resume", type=str, help="checkpoint to load from.")
-    parser.add_argument(
-        "--wavenet", type=str, help="wavenet checkpoint to use.")
-    parser.add_argument(
-        "--output",
-        type=str,
-        default="experiment",
-        help="path to save student.")
+    parser.add_argument("--config", type=str, help="path of the config file")
+    parser.add_argument("--device", type=int, default=-1, help="device to use")
+    parser.add_argument("--data", type=str, help="path of LJspeech dataset")
+
+    g = parser.add_mutually_exclusive_group()
+    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
+    g.add_argument(
+        "--iteration",
+        type=int,
+        help="the iteration of the checkpoint to load from output directory")
+
+    parser.add_argument(
+        "--wavenet", type=str, help="wavenet checkpoint to use")
+
+    parser.add_argument(
+        "output",
+        type=str,
+        default="experiment",
+        help="path to save experiment results")
 
     args = parser.parse_args()
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
 
     print("Command Line args: ")
     for k, v in vars(args).items():
         print("{}: {}".format(k, v))
 
     ljspeech_meta = LJSpeechMetaData(args.data)
 
     data_config = config["data"]
@@ -154,12 +169,28 @@ if __name__ == "__main__":
     clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
         gradiant_max_norm)
 
-    assert args.wavenet or args.resume, "you should load from a trained wavenet or resume training; training without a trained wavenet is not recommended."
-    if args.wavenet:
-        load_wavenet(model, args.wavenet)
+    # train
+    max_iterations = train_config["max_iterations"]
+    checkpoint_interval = train_config["checkpoint_interval"]
+    eval_interval = train_config["eval_interval"]
+    checkpoint_dir = os.path.join(args.output, "checkpoints")
+    state_dir = os.path.join(args.output, "states")
+    log_dir = os.path.join(args.output, "log")
+    writer = SummaryWriter(log_dir)
 
-    if args.resume:
-        load_checkpoint(model, optim, args.resume)
+    if args.checkpoint is not None:
+        iteration = io.load_parameters(
+            model, optim, checkpoint_path=args.checkpoint)
+    else:
+        iteration = io.load_parameters(
+            model,
+            optim,
+            checkpoint_dir=checkpoint_dir,
+            iteration=args.iteration)
+
+    if iteration == 0:
+        assert args.wavenet is not None, "When training afresh, a trained wavenet model should be provided."
+        load_wavenet(model, args.wavenet)
 
     # loader
     train_loader = fluid.io.DataLoader.from_generator(
@@ -170,52 +201,42 @@ if __name__ == "__main__":
         capacity=10, return_list=True)
     valid_loader.set_batch_generator(valid_cargo, place)
 
-    # train
-    max_iterations = train_config["max_iterations"]
-    checkpoint_interval = train_config["checkpoint_interval"]
-    eval_interval = train_config["eval_interval"]
-    checkpoint_dir = os.path.join(args.output, "checkpoints")
-    state_dir = os.path.join(args.output, "states")
-    log_dir = os.path.join(args.output, "log")
-    writer = SummaryWriter(log_dir)
-
     # training loop
-    global_step = 1
-    global_epoch = 1
-    while global_step < max_iterations:
-        epoch_loss = 0.
-        for j, batch in tqdm(enumerate(train_loader), desc="[train]"):
-            audios, mels, audio_starts = batch
-            model.train()
-            loss_dict = model(
-                audios, mels, audio_starts, clip_kl=global_step > 500)
-
-            writer.add_scalar("learning_rate",
-                              optim._learning_rate.step().numpy()[0],
-                              global_step)
-            for k, v in loss_dict.items():
-                writer.add_scalar("loss/{}".format(k),
-                                  v.numpy()[0], global_step)
-
-            l = loss_dict["loss"]
-            step_loss = l.numpy()[0]
-            print("[train] loss: {:<8.6f}".format(step_loss))
-            epoch_loss += step_loss
-
-            l.backward()
-            optim.minimize(l, grad_clip=clipper)
-            optim.clear_gradients()
-
-            if global_step % eval_interval == 0:
-                # evaluate on valid dataset
-                valid_model(model, valid_loader, state_dir, global_step,
-                            sample_rate)
-            if global_step % checkpoint_interval == 0:
-                save_checkpoint(model, optim, checkpoint_dir, global_step)
-
-            global_step += 1
-
-        # epoch loss
-        average_loss = epoch_loss / j
-        writer.add_scalar("average_loss", average_loss, global_epoch)
-        global_epoch += 1
+    global_step = iteration + 1
+    iterator = iter(tqdm(train_loader))
+    while global_step <= max_iterations:
+        try:
+            batch = next(iterator)
+        except StopIteration as e:
+            iterator = iter(tqdm(train_loader))
+            batch = next(iterator)
+
+        audios, mels, audio_starts = batch
+        model.train()
+        loss_dict = model(
+            audios, mels, audio_starts, clip_kl=global_step > 500)
+
+        writer.add_scalar("learning_rate",
+                          optim._learning_rate.step().numpy()[0],
+                          global_step)
+        for k, v in loss_dict.items():
+            writer.add_scalar("loss/{}".format(k),
+                              v.numpy()[0], global_step)
+
+        l = loss_dict["loss"]
+        step_loss = l.numpy()[0]
+        print("[train] global_step: {} loss: {:<8.6f}".format(global_step,
+                                                              step_loss))
+
+        l.backward()
+        optim.minimize(l, grad_clip=clipper)
+        optim.clear_gradients()
+
+        global_step += 1
+        if global_step % eval_interval == 0:
+            # evaluate on valid dataset
+            eval_model(model, valid_loader, state_dir, global_step,
+                       sample_rate)
+        if global_step % checkpoint_interval == 0:
+            io.save_parameters(checkpoint_dir, global_step, model, optim)
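The step-based loop above keeps a single iterator over the data loader and restarts it on `StopIteration`, instead of nesting an epoch loop. The same idea can be written as a tiny reusable generator (a sketch, not part of the codebase):

```python
def endless(loader):
    """Yield batches forever, restarting the loader when it is exhausted."""
    while True:
        for batch in loader:
            yield batch

# usage sketch:
# batches = endless(train_loader)
# for step in range(1, max_iterations + 1):
#     batch = next(batches)
```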
@@ -32,12 +32,12 @@ def make_output_tree(output_dir):
         os.makedirs(state_dir)
 
 
-def valid_model(model, valid_loader, output_dir, global_step, sample_rate):
+def eval_model(model, valid_loader, output_dir, iteration, sample_rate):
     model.eval()
     for i, batch in enumerate(valid_loader):
         # print("sentence {}".format(i))
         path = os.path.join(output_dir,
-                            "step_{}_sentence_{}.wav".format(global_step, i))
+                            "sentence_{}_step_{}.wav".format(i, iteration))
         audio_clips, mel_specs, audio_starts = batch
         wav_var = model.synthesis(mel_specs)
         wav_np = wav_var.numpy()[0]
 
@@ -45,42 +45,6 @@ def valid_model(model, valid_loader, output_dir, global_step, sample_rate):
         print("generated {}".format(path))
 
 
-def eval_model(model, valid_loader, output_dir, sample_rate):
-    model.eval()
-    for i, batch in enumerate(valid_loader):
-        # print("sentence {}".format(i))
-        path = os.path.join(output_dir, "sentence_{}.wav".format(i))
-        audio_clips, mel_specs, audio_starts = batch
-        wav_var = model.synthesis(mel_specs)
-        wav_np = wav_var.numpy()[0]
-        sf.write(path, wav_np, samplerate=sample_rate)
-        print("generated {}".format(path))
-
-
-def save_checkpoint(model, optim, checkpoint_dir, global_step):
-    path = os.path.join(checkpoint_dir, "step_{}".format(global_step))
-    dg.save_dygraph(model.state_dict(), path)
-    print("saving model to {}".format(path + ".pdparams"))
-    if optim:
-        dg.save_dygraph(optim.state_dict(), path)
-        print("saving optimizer to {}".format(path + ".pdopt"))
-
-
-def load_model(model, path):
-    model_dict, _ = dg.load_dygraph(path)
-    model.set_dict(model_dict)
-    print("loaded model from {}.pdparams".format(path))
-
-
-def load_checkpoint(model, optim, path):
-    model_dict, optim_dict = dg.load_dygraph(path)
-    model.set_dict(model_dict)
-    print("loaded model from {}.pdparams".format(path))
-    if optim_dict:
-        optim.set_dict(optim_dict)
-        print("loaded optimizer from {}.pdparams".format(path))
-
-
 def load_wavenet(model, path):
     wavenet_dict, _ = dg.load_dygraph(path)
     encoder_dict = OrderedDict()
@@ -30,32 +30,55 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
 └── utils.py             utility functions
 ```
 
+## Saving & Loading
+
+`train.py` and `synthesis.py` have 3 arguments in common: `--checkpoint`, `--iteration` and `output`.
+
+1. `output` is the directory for saving results.
+During training, checkpoints are saved in `checkpoints/` in `output` and the tensorboard log is saved in `log/` in `output`. Other possible outputs are saved in `states/` in `output`.
+During synthesizing, audio files and other possible outputs are saved in `synthesis/` in `output`.
+So after training and synthesizing with the same output directory, the file structure of the output directory looks like this.
+
+```text
+├── checkpoints/      # checkpoint directory (including *.pdparams, *.pdopt and a text file `checkpoint` that records the latest checkpoint)
+├── states/           # audio files generated at validation and other possible outputs
+├── log/              # tensorboard log
+└── synthesis/        # synthesized audio files and other possible outputs
+```
+
+2. `--checkpoint` and `--iteration` are used for loading from an existing checkpoint. Loading an existing checkpoint follows these rules:
+If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
+If `--checkpoint` is not provided, we try to load the checkpoint specified by `--iteration` from the checkpoint directory. If `--iteration` is not provided either, we try to load the latest checkpoint from the checkpoint directory.
 
 ## Train
 
 Train the model using `train.py`; follow the usage displayed by `python train.py --help`.
 
 ```text
-usage: train.py [-h] [-c CONFIG] [-s DATA] [-r RESUME] [-o OUTPUT] [-g DEVICE]
+usage: train.py [-h] [--config CONFIG] [--data DATA] [--device DEVICE]
+                [--checkpoint CHECKPOINT | --iteration ITERATION]
+                output
 
 Train a Deep Voice 3 model with LJSpeech dataset.
 
+positional arguments:
+  output                path to save results
+
 optional arguments:
-  -h, --help            show this help message and exit
-  -c CONFIG, --config CONFIG
-                        experimrnt config
-  -s DATA, --data DATA  The path of the LJSpeech dataset.
-  -r RESUME, --resume RESUME
-                        checkpoint to load
-  -o OUTPUT, --output OUTPUT
-                        The directory to save result.
-  -g DEVICE, --device DEVICE
-                        device to use
+  -h, --help            show this help message and exit
+  --config CONFIG       experiment config
+  --data DATA           The path of the LJSpeech dataset.
+  --device DEVICE       device to use
+  --checkpoint CHECKPOINT
+                        checkpoint to resume from
+  --iteration ITERATION
+                        the iteration of the checkpoint to load from output
+                        directory
 ```
 
 - `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. You can also change some values in the configuration file and train the model with a different config.
 - `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains `metadata.txt`).
-- `--resume` is the path of the checkpoint. If it is provided, the model would load the checkpoint before trainig.
-- `--output` is the directory to save results, all results are saved in this directory. The structure of the output directory is shown below.
+- `--device` is the device (gpu id) to use for training. `-1` means CPU.
+- `--checkpoint` is the path of the checkpoint.
+- `--iteration` is the iteration of the checkpoint to load from the output directory.
+See [Saving-&-Loading](#saving--loading) for details of checkpoint loading.
+- `output` is the directory to save results; all results are saved in this directory. The structure of the output directory is shown below.
 
 ```text
 ├── checkpoints      # checkpoint
 
@@ -67,12 +90,14 @@ optional arguments:
 └── waveform         # waveform (.wav files)
 ```
 
-- `--device` is the device (gpu id) to use for training. `-1` means CPU.
-
 Example script:
 
 ```bash
-python train.py --config=configs/ljspeech.yaml --data=./LJSpeech-1.1/ --output=experiment --device=0
+python train.py \
+    --config=configs/ljspeech.yaml \
+    --data=./LJSpeech-1.1/ \
+    --device=0 \
+    experiment
 ```
 
 You can monitor the training log via tensorboard, using the script below.
@@ -84,31 +109,50 @@ tensorboard --logdir=.
 
 ## Synthesis
 ```text
-usage: synthesis.py [-h] [-c CONFIG] [-g DEVICE] checkpoint text output_path
+usage: synthesis.py [-h] [--config CONFIG] [--device DEVICE]
+                    [--checkpoint CHECKPOINT | --iteration ITERATION]
+                    text output
 
-Synthsize waveform from a checkpoint.
+Synthesize waveform with a checkpoint.
 
 positional arguments:
-  checkpoint            checkpoint to load.
   text                  text file to synthesize
-  output_path           path to save results
+  output                path to save synthesized audio
 
 optional arguments:
-  -h, --help            show this help message and exit
-  -c CONFIG, --config CONFIG
-                        experiment config.
-  -g DEVICE, --device DEVICE
-                        device to use
+  -h, --help            show this help message and exit
+  --config CONFIG       experiment config
+  --device DEVICE       device to use
+  --checkpoint CHECKPOINT
+                        checkpoint to resume from
+  --iteration ITERATION
+                        the iteration of the checkpoint to load from output
+                        directory
 ```
 
 - `--config` is the configuration file to use. You should use the same configuration with which you trained your model.
-- `checkpoint` is the checkpoint to load.
-- `text`is the text file to synthesize.
-- `output_path` is the directory to save results. The output path contains the generated audio files (`*.wav`) and attention plots (*.png) for each sentence.
 - `--device` is the device (gpu id) to use. `-1` means CPU.
+- `--checkpoint` is the path of the checkpoint.
+- `--iteration` is the iteration of the checkpoint to load from the output directory.
+See [Saving-&-Loading](#saving--loading) for details of checkpoint loading.
+- `text` is the text file to synthesize.
+- `output` is the directory to save results. The generated audio files (`*.wav`) and attention plots (`*.png`) are saved in `synthesis/` in the output directory.
 
 Example script:
 
 ```bash
-python synthesis.py --config=configs/ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
+python synthesis.py \
+    --config=configs/ljspeech.yaml \
+    --device=0 \
+    --checkpoint="experiment/checkpoints/model_step_005000000" \
+    sentences.txt experiment
 ```
 
 or
 
 ```bash
+python synthesis.py \
+    --config=configs/ljspeech.yaml \
+    --device=0 \
+    --iteration=005000000 \
+    sentences.txt experiment
 ```
@@ -83,7 +83,7 @@ lr_scheduler:
 
 train:
   batch_size: 16
-  epochs: 2000
+  max_iteration: 2000000
 
   snap_interval: 1000
   eval_interval: 10000
@@ -189,11 +189,14 @@ class DataCollector(object):
         # text positions
         text_mask = (np.arange(1, 1 + max_text_length) <= np.expand_dims(
             text_lengths, -1)).astype(np.int64)
-        text_positions = np.arange(1, 1 + max_text_length) * text_mask
+        text_positions = np.arange(
+            1, 1 + max_text_length, dtype=np.int64) * text_mask
 
         # decoder_positions
         decoder_positions = np.tile(
-            np.expand_dims(np.arange(1, 1 + max_decoder_length), 0),
+            np.expand_dims(
+                np.arange(
+                    1, 1 + max_decoder_length, dtype=np.int64), 0),
             (batch_size, 1))
 
         return (text_sequences, text_lengths, text_positions, mel_specs,
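To make the change above concrete, here is a tiny standalone example of the mask and position arrays, with made-up lengths (the explicit `dtype=np.int64` likely exists so the positions are int64 regardless of the platform's default integer type):

```python
import numpy as np

# two sentences of lengths 3 and 5, padded to max_text_length = 5
text_lengths = np.array([3, 5])
max_text_length = 5

# mask[i, t] is 1 while position t is inside sentence i, else 0
text_mask = (np.arange(1, 1 + max_text_length) <= np.expand_dims(
    text_lengths, -1)).astype(np.int64)
# positions are 1-based and zeroed out past each sentence's end
text_positions = np.arange(
    1, 1 + max_text_length, dtype=np.int64) * text_mask

print(text_positions)
# [[1 2 3 0 0]
#  [1 2 3 4 5]]
```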
@@ -25,25 +25,37 @@ import paddle.fluid.dygraph as dg
 from tensorboardX import SummaryWriter
 
 from parakeet.g2p import en
-from parakeet.utils.layer_tools import summary
 from parakeet.modules.weight_norm import WeightNormWrapper
+from parakeet.utils.layer_tools import summary
+from parakeet.utils import io
 
 from utils import make_model, eval_model, plot_alignment
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Synthesize waveform with a checkpoint.")
-    parser.add_argument("-c", "--config", type=str, help="experiment config.")
-    parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
+    parser.add_argument("--config", type=str, help="experiment config")
+    parser.add_argument("--device", type=int, default=-1, help="device to use")
+
+    g = parser.add_mutually_exclusive_group()
+    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
+    g.add_argument(
+        "--iteration",
+        type=int,
+        help="the iteration of the checkpoint to load from output directory")
 
     parser.add_argument("text", type=str, help="text file to synthesize")
-    parser.add_argument("output_path", type=str, help="path to save results")
-    parser.add_argument(
-        "-g", "--device", type=int, default=-1, help="device to use")
+    parser.add_argument(
+        "output", type=str, help="path to save synthesized audio")
 
     args = parser.parse_args()
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
 
     print("Command Line Args: ")
     for k, v in vars(args).items():
         print("{}: {}".format(k, v))
 
     if args.device == -1:
         place = fluid.CPUPlace()
     else:
@@ -98,16 +110,21 @@ if __name__ == "__main__":
         linear_dim, use_decoder_states, converter_channels, dropout)
 
     summary(dv3)
-    state, _ = dg.load_dygraph(args.checkpoint)
-    dv3.set_dict(state)
+
+    checkpoint_dir = os.path.join(args.output, "checkpoints")
+    if args.checkpoint is not None:
+        iteration = io.load_parameters(
+            dv3, checkpoint_path=args.checkpoint)
+    else:
+        iteration = io.load_parameters(
+            dv3, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
 
     # WARNING: don't forget to remove weight norm to re-compute each wrapped layer's weight
     # removing weight norm also speeds up computation
     for layer in dv3.sublayers():
         if isinstance(layer, WeightNormWrapper):
             layer.remove_weight_norm()
 
-    if not os.path.exists(args.output_path):
-        os.makedirs(args.output_path)
-
     transform_config = config["transform"]
     c = transform_config["replace_pronunciation_prob"]
     sample_rate = transform_config["sample_rate"]
@@ -121,6 +138,10 @@ if __name__ == "__main__":
     power = synthesis_config["power"]
     n_iter = synthesis_config["n_iter"]
 
+    synthesis_dir = os.path.join(args.output, "synthesis")
+    if not os.path.exists(synthesis_dir):
+        os.makedirs(synthesis_dir)
+
     with open(args.text, "rt", encoding="utf-8") as f:
         lines = f.readlines()
         for idx, line in enumerate(lines):
@@ -132,7 +153,9 @@ if __name__ == "__main__":
                 preemphasis)
             plot_alignment(
                 attn,
-                os.path.join(args.output_path, "test_{}.png".format(idx)))
+                os.path.join(synthesis_dir,
+                             "test_{}_step_{}.png".format(idx, iteration)))
             sf.write(
-                os.path.join(args.output_path, "test_{}.wav".format(idx)),
+                os.path.join(synthesis_dir,
+                             "test_{}_step_{}.wav".format(idx, iteration)),
                 wav, sample_rate)
@@ -17,6 +17,8 @@ import os
 import argparse
 import ruamel.yaml
+import numpy as np
 import matplotlib
 matplotlib.use("agg")
+from matplotlib import cm
 import matplotlib.pyplot as plt
 import tqdm
 
@@ -35,33 +37,40 @@ from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler,
 from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, ConvSpec
 from parakeet.models.deepvoice3.loss import TTSLoss
 from parakeet.utils.layer_tools import summary
+from parakeet.utils import io
 
 from data import LJSpeechMetaData, DataCollector, Transform
 from utils import make_model, eval_model, save_state, make_output_tree, plot_alignment
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Train a deepvoice 3 model with LJSpeech dataset.")
-    parser.add_argument("-c", "--config", type=str, help="experimrnt config")
+        description="Train a Deep Voice 3 model with LJSpeech dataset.")
+    parser.add_argument("--config", type=str, help="experiment config")
     parser.add_argument(
-        "-s",
         "--data",
         type=str,
         default="/workspace/datasets/LJSpeech-1.1/",
         help="The path of the LJSpeech dataset.")
-    parser.add_argument("-r", "--resume", type=str, help="checkpoint to load")
+    parser.add_argument("--device", type=int, default=-1, help="device to use")
+
+    g = parser.add_mutually_exclusive_group()
+    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
+    g.add_argument(
+        "--iteration",
+        type=int,
+        help="the iteration of the checkpoint to load from output directory")
 
     parser.add_argument(
-        "-o",
-        "--output",
-        type=str,
-        default="result",
-        help="The directory to save result.")
-    parser.add_argument(
-        "-g", "--device", type=int, default=-1, help="device to use")
+        "output", type=str, default="experiment", help="path to save results")
 
     args, _ = parser.parse_known_args()
     with open(args.config, 'rt') as f:
         config = ruamel.yaml.safe_load(f)
 
     print("Command Line Args: ")
     for k, v in vars(args).items():
         print("{}: {}".format(k, v))
 
     # =========================dataset=========================
     # construct meta data
     data_root = args.data
@@ -151,6 +160,7 @@ if __name__ == "__main__":
         query_position_rate, key_position_rate, window_backward,
         window_ahead, key_projection, value_projection, downsample_factor,
         linear_dim, use_decoder_states, converter_channels, dropout)
+    summary(dv3)
 
     # =========================loss=========================
     loss_config = config["loss"]
 
@@ -195,7 +205,6 @@ if __name__ == "__main__":
     n_iter = synthesis_config["n_iter"]
 
     # =========================link(dataloader, paddle)=========================
-    # CAUTION: it does not return a DataLoader
     loader = fluid.io.DataLoader.from_generator(
         capacity=10, return_list=True)
     loader.set_batch_generator(ljspeech_loader, places=place)
@@ -208,122 +217,117 @@ if __name__ == "__main__":
     make_output_tree(output_dir)
     writer = SummaryWriter(logdir=log_dir)
 
-    # load model parameters
-    resume_path = args.resume
-    if resume_path is not None:
-        state, _ = dg.load_dygraph(args.resume)
-        dv3.set_dict(state)
+    # load parameters and optimizer, and update the iterations done so far
+    if args.checkpoint is not None:
+        iteration = io.load_parameters(
+            dv3, optim, checkpoint_path=args.checkpoint)
+    else:
+        iteration = io.load_parameters(
+            dv3, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)
 
     # =========================train=========================
-    epoch = train_config["epochs"]
     max_iter = train_config["max_iteration"]
     snap_interval = train_config["snap_interval"]
     save_interval = train_config["save_interval"]
     eval_interval = train_config["eval_interval"]
 
-    global_step = 1
-    for j in range(1, 1 + epoch):
-        epoch_loss = 0.
-        for i, batch in tqdm.tqdm(enumerate(loader, 1)):
-            dv3.train()  # CAUTION: don't forget to switch to train
-            (text_sequences, text_lengths, text_positions, mel_specs,
-             lin_specs, frames, decoder_positions, done_flags) = batch
-            downsampled_mel_specs = F.strided_slice(
-                mel_specs,
-                axes=[1],
-                starts=[0],
-                ends=[mel_specs.shape[1]],
-                strides=[downsample_factor])
-            mel_outputs, linear_outputs, alignments, done = dv3(
-                text_sequences, text_positions, text_lengths, None,
-                downsampled_mel_specs, decoder_positions)
+    global_step = iteration + 1
+    iterator = iter(tqdm.tqdm(loader))
+    while global_step <= max_iter:
+        try:
+            batch = next(iterator)
+        except StopIteration as e:
+            iterator = iter(tqdm.tqdm(loader))
+            batch = next(iterator)
 
-            losses = criterion(mel_outputs, linear_outputs, done,
-                               alignments, downsampled_mel_specs,
-                               lin_specs, done_flags, text_lengths, frames)
-            l = losses["loss"]
-            l.backward()
-            # record learning rate before updating
-            writer.add_scalar("learning_rate",
-                              optim._learning_rate.step().numpy(),
-                              global_step)
-            optim.minimize(l, grad_clip=gradient_clipper)
-            optim.clear_gradients()
+        dv3.train()
+        (text_sequences, text_lengths, text_positions, mel_specs,
+         lin_specs, frames, decoder_positions, done_flags) = batch
+        downsampled_mel_specs = F.strided_slice(
+            mel_specs,
+            axes=[1],
+            starts=[0],
+            ends=[mel_specs.shape[1]],
+            strides=[downsample_factor])
+        mel_outputs, linear_outputs, alignments, done = dv3(
+            text_sequences, text_positions, text_lengths, None,
+            downsampled_mel_specs, decoder_positions)
 
-            # ==================all kinds of tedious things=================
-            # record step loss into tensorboard
-            epoch_loss += l.numpy()[0]
-            step_loss = {k: v.numpy()[0] for k, v in losses.items()}
-            for k, v in step_loss.items():
-                writer.add_scalar(k, v, global_step)
+        losses = criterion(mel_outputs, linear_outputs, done, alignments,
+                           downsampled_mel_specs, lin_specs, done_flags,
+                           text_lengths, frames)
+        l = losses["loss"]
+        l.backward()
+        # record learning rate before updating
+        writer.add_scalar("learning_rate",
+                          optim._learning_rate.step().numpy(), global_step)
+        optim.minimize(l, grad_clip=gradient_clipper)
+        optim.clear_gradients()
 
-            # TODO: clean code
-            # train state saving, the first sentence in the batch
-            if global_step % snap_interval == 0:
-                save_state(
-                    state_dir,
-                    writer,
-                    global_step,
-                    mel_input=downsampled_mel_specs,
-                    mel_output=mel_outputs,
-                    lin_input=lin_specs,
-                    lin_output=linear_outputs,
-                    alignments=alignments,
-                    win_length=win_length,
-                    hop_length=hop_length,
-                    min_level_db=min_level_db,
-                    ref_level_db=ref_level_db,
-                    power=power,
-                    n_iter=n_iter,
-                    preemphasis=preemphasis,
-                    sample_rate=sample_rate)
+        # ==================all kinds of tedious things=================
+        # record step loss into tensorboard
+        step_loss = {k: v.numpy()[0] for k, v in losses.items()}
+        tqdm.tqdm.write("global_step: {}\tloss: {}".format(
+            global_step, step_loss["loss"]))
+        for k, v in step_loss.items():
+            writer.add_scalar(k, v, global_step)
 
-            # evaluation
-            if global_step % eval_interval == 0:
-                sentences = [
-                    "Scientists at the CERN laboratory say they have discovered a new particle.",
-                    "There's a way to measure the acute emotional intelligence that has never gone out of style.",
-                    "President Trump met with other leaders at the Group of 20 conference.",
-                    "Generative adversarial network or variational auto-encoder.",
-                    "Please call Stella.",
-                    "Some have accepted this as a miracle without any physical explanation.",
-                ]
-                for idx, sent in enumerate(sentences):
-                    wav, attn = eval_model(
-                        dv3, sent, replace_pronounciation_prob, min_level_db,
-                        ref_level_db, power, n_iter, win_length, hop_length,
-                        preemphasis)
-                    wav_path = os.path.join(
-                        state_dir, "waveform",
-                        "eval_sample_{:09d}.wav".format(global_step))
-                    sf.write(wav_path, wav, sample_rate)
-                    writer.add_audio(
-                        "eval_sample_{}".format(idx),
-                        wav,
-                        global_step,
-                        sample_rate=sample_rate)
-                    attn_path = os.path.join(
-                        state_dir, "alignments",
-                        "eval_sample_attn_{:09d}.png".format(global_step))
-                    plot_alignment(attn, attn_path)
-                    writer.add_image(
-                        "eval_sample_attn{}".format(idx),
-                        cm.viridis(attn),
-                        global_step,
-                        dataformats="HWC")
+        # train state saving, the first sentence in the batch
+        if global_step % snap_interval == 0:
+            save_state(
+                state_dir,
+                writer,
+                global_step,
+                mel_input=downsampled_mel_specs,
+                mel_output=mel_outputs,
+                lin_input=lin_specs,
+                lin_output=linear_outputs,
+                alignments=alignments,
+                win_length=win_length,
+                hop_length=hop_length,
+                min_level_db=min_level_db,
+                ref_level_db=ref_level_db,
+                power=power,
+                n_iter=n_iter,
+                preemphasis=preemphasis,
+                sample_rate=sample_rate)
 
-            # save checkpoint
-            if global_step % save_interval == 0:
-                dg.save_dygraph(
-                    dv3.state_dict(),
-                    os.path.join(ckpt_dir,
-                                 "model_step_{}".format(global_step)))
-                dg.save_dygraph(
-                    optim.state_dict(),
-                    os.path.join(ckpt_dir,
-                                 "model_step_{}".format(global_step)))
-
-            global_step += 1
-
-        # epoch report
-        writer.add_scalar("epoch_average_loss", epoch_loss / i, j)
-        epoch_loss = 0.
+        # evaluation
+        if global_step % eval_interval == 0:
+            sentences = [
+                "Scientists at the CERN laboratory say they have discovered a new particle.",
+                "There's a way to measure the acute emotional intelligence that has never gone out of style.",
+                "President Trump met with other leaders at the Group of 20 conference.",
+                "Generative adversarial network or variational auto-encoder.",
+                "Please call Stella.",
+                "Some have accepted this as a miracle without any physical explanation.",
+            ]
+            for idx, sent in enumerate(sentences):
+                wav, attn = eval_model(
+                    dv3, sent, replace_pronounciation_prob,
+                    min_level_db, ref_level_db, power, n_iter,
+                    win_length, hop_length, preemphasis)
+                wav_path = os.path.join(
+                    state_dir, "waveform",
+                    "eval_sample_{:09d}.wav".format(global_step))
+                sf.write(wav_path, wav, sample_rate)
+                writer.add_audio(
+                    "eval_sample_{}".format(idx),
+                    wav,
+                    global_step,
+                    sample_rate=sample_rate)
+                attn_path = os.path.join(
+                    state_dir, "alignments",
+                    "eval_sample_attn_{:09d}.png".format(global_step))
+                plot_alignment(attn, attn_path)
+                writer.add_image(
+                    "eval_sample_attn{}".format(idx),
+                    cm.viridis(attn),
+                    global_step,
+                    dataformats="HWC")
 
+        # save checkpoint
+        if global_step % save_interval == 0:
+            io.save_parameters(ckpt_dir, global_step, dv3, optim)
+
+        global_step += 1
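Both training scripts now route checkpointing through `parakeet.utils.io`. That module is not shown in this commit, but from the `dg.save_dygraph`/`dg.load_dygraph` calls in the deleted helpers above, a simplified sketch of what such save/load functions can look like (a hypothetical reduction, not the actual implementation):

```python
import os
import paddle.fluid.dygraph as dg


def save_parameters(checkpoint_dir, iteration, model, optim=None):
    # save_dygraph writes step-<it>.pdparams for a parameter dict
    # and step-<it>.pdopt for an optimizer state dict
    path = os.path.join(checkpoint_dir, "step-{}".format(iteration))
    dg.save_dygraph(model.state_dict(), path)
    if optim is not None:
        dg.save_dygraph(optim.state_dict(), path)


def load_parameters(model, optim=None, checkpoint_path=None):
    # returns the iteration parsed from the checkpoint name, 0 if nothing loaded
    if checkpoint_path is None:
        return 0
    model_dict, optim_dict = dg.load_dygraph(checkpoint_path)
    model.set_dict(model_dict)
    if optim is not None and optim_dict is not None:
        optim.set_dict(optim_dict)
    return int(checkpoint_path.split("-")[-1])
```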
@@ -13,8 +13,8 @@ PaddlePaddle dynamic graph implementation of [WaveFlow: A Compact Flow-based Mod
 ├── synthesis.py                         # script for speech synthesis
 ├── train.py                             # script for model training
 ├── utils.py                             # helper functions, e.g. for model checkpointing
-├── parakeet/models/waveflow/data.py     # dataset and dataloader settings for LJSpeech
-├── parakeet/models/waveflow/waveflow.py # WaveFlow model high level APIs
+├── data.py                              # dataset and dataloader settings for LJSpeech
+├── waveflow.py                          # WaveFlow model high level APIs
 └── parakeet/models/waveflow/waveflow_modules.py # WaveFlow model implementation
 ```
 
@@ -48,12 +48,12 @@ python -u train.py \
     --config=./configs/waveflow_ljspeech.yaml \
     --root=./data/LJSpeech-1.1 \
     --name=${ModelName} --batch_size=4 \
-    --parallel=false --use_gpu=true
+    --use_gpu=true
 ```
 
 #### Save and Load checkpoints
 
-Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
+Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default, where `${ModelName}` is the name of one single experiment and can be whatever you like.
 The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
 
 There are three ways to load a checkpoint and resume training (suppose you want to load a 500000-iteration checkpoint):
 
@@ -68,7 +68,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3
 python -u -m paddle.distributed.launch train.py \
     --config=./configs/waveflow_ljspeech.yaml \
     --root=./data/LJSpeech-1.1 \
-    --name=${ModelName} --parallel=true --use_gpu=true
+    --name=${ModelName} --use_gpu=true
 ```
 
 Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to make the GPUs that you want to use visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode.
@@ -22,7 +22,8 @@ import paddle.fluid.dygraph as dg
 from paddle import fluid
 
 import utils
-from parakeet.models.waveflow import WaveFlow
+from parakeet.utils import io
+from waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
 
@@ -98,5 +99,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
-    config = utils.add_yaml_config(config)
+    config = io.add_yaml_config_to_args(config)
     benchmark(config)
@@ -21,8 +21,9 @@ import numpy as np
 import paddle.fluid.dygraph as dg
 from paddle import fluid
 
+from parakeet.utils import io
 import utils
-from parakeet.models.waveflow import WaveFlow
+from waveflow import WaveFlow
 
 
 def add_options_to_parser(parser):
 
@@ -96,7 +97,7 @@ def synthesize(config):
     # Obtain the current iteration.
     if config.checkpoint is None:
         if config.iteration is None:
-            iteration = utils.load_latest_checkpoint(checkpoint_dir)
+            iteration = io.load_latest_checkpoint(checkpoint_dir)
         else:
             iteration = config.iteration
     else:
 
@@ -117,5 +118,5 @@ if __name__ == "__main__":
     # For conflicting updates to the same field,
     # the preceding update will be overwritten by the following one.
     config = parser.parse_args()
-    config = utils.add_yaml_config(config)
+    config = io.add_yaml_config_to_args(config)
     synthesize(config)
@ -25,7 +25,8 @@ from paddle import fluid
|
|||
from tensorboardX import SummaryWriter
|
||||
|
||||
import utils
|
||||
from parakeet.models.waveflow import WaveFlow
|
||||
from parakeet.utils import io
|
||||
from waveflow import WaveFlow
|
||||
|
||||
|
||||
def add_options_to_parser(parser):
|
||||
|
@ -39,11 +40,6 @@ def add_options_to_parser(parser):
|
|||
parser.add_argument(
|
||||
'--root', type=str, help="root path of the LJSpeech dataset")
|
||||
|
||||
parser.add_argument(
|
||||
'--parallel',
|
||||
type=utils.str2bool,
|
||||
default=True,
|
||||
help="option to use data parallel training")
|
||||
parser.add_argument(
|
||||
'--use_gpu',
|
||||
type=utils.str2bool,
|
||||
|
@ -65,11 +61,11 @@ def add_options_to_parser(parser):

def train(config):
    use_gpu = config.use_gpu
    parallel = config.parallel if use_gpu else False

    # Get the rank of the current training process.
    rank = dg.parallel.Env().local_rank if parallel else 0
    nranks = dg.parallel.Env().nranks if parallel else 1
    rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    if rank == 0:
        # Print the whole config setting.
@ -99,16 +95,7 @@ def train(config):

    # Build model.
    model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
    model.build()

    # Obtain the current iteration.
    if config.checkpoint is None:
        if config.iteration is None:
            iteration = utils.load_latest_checkpoint(checkpoint_dir, rank)
        else:
            iteration = config.iteration
    else:
        iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
    iteration = model.build()

    while iteration < config.max_iterations:
        # Run one single training step.
@ -140,7 +127,7 @@ if __name__ == "__main__":
    # For conflicting updates to the same field,
    # the preceding update will be overwritten by the following one.
    config = parser.parse_args()
    config = utils.add_yaml_config(config)
    config = io.add_yaml_config_to_args(config)
    # Force to use fp32 in model training
    vars(config)["use_fp16"] = False
    train(config)
@ -12,14 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import os
import time

import argparse
import ruamel.yaml
import numpy as np
import paddle.fluid.dygraph as dg


def str2bool(v):
@ -95,131 +88,3 @@ def add_config_options_to_parser(parser):
        '--kernel_w', type=int, help="width of the kernel in the conv2d layer")

    parser.add_argument('--config', type=str, help="Path to the config file.")


def add_yaml_config(config):
    with open(config.config, 'rt') as f:
        yaml_cfg = ruamel.yaml.safe_load(f)
    cfg_vars = vars(config)
    for k, v in yaml_cfg.items():
        if k in cfg_vars and cfg_vars[k] is not None:
            continue
        cfg_vars[k] = v
    return config


def load_latest_checkpoint(checkpoint_dir, rank=0):
    """Get the iteration number corresponding to the latest saved checkpoint.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        rank (int, optional): the rank of the process in multi-process setting.
            Defaults to 0.

    Returns:
        int: the latest iteration number.
    """
    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
    # Create checkpoint index file if not exist.
    if (not os.path.isfile(checkpoint_path)) and rank == 0:
        with open(checkpoint_path, "w") as handle:
            handle.write("model_checkpoint_path: step-0")

    # Make sure that other process waits until checkpoint file is created
    # by process 0.
    while not os.path.isfile(checkpoint_path):
        time.sleep(1)

    # Fetch the latest checkpoint index.
    with open(checkpoint_path, "r") as handle:
        latest_checkpoint = handle.readline().split()[-1]
    iteration = int(latest_checkpoint.split("-")[-1])

    return iteration


def save_latest_checkpoint(checkpoint_dir, iteration):
    """Save the iteration number of the latest model to be checkpointed.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        iteration (int): the latest iteration number.

    Returns:
        None
    """
    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
    # Update the latest checkpoint index.
    with open(checkpoint_path, "w") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))


def load_parameters(checkpoint_dir,
                    rank,
                    model,
                    optimizer=None,
                    iteration=None,
                    file_path=None,
                    dtype="float32"):
    """Load a specific model checkpoint from disk.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        rank (int): the rank of the process in multi-process setting.
        model (obj): model to load parameters.
        optimizer (obj, optional): optimizer to load states if needed.
            Defaults to None.
        iteration (int, optional): if specified, load the specific checkpoint,
            if not specified, load the latest one. Defaults to None.
        file_path (str, optional): if specified, load the checkpoint
            stored in the file_path. Defaults to None.
        dtype (str, optional): precision of the model parameters.
            Defaults to float32.

    Returns:
        None
    """
    if file_path is None:
        if iteration is None:
            iteration = load_latest_checkpoint(checkpoint_dir, rank)
        if iteration == 0:
            return
        file_path = "{}/step-{}".format(checkpoint_dir, iteration)

    model_dict, optimizer_dict = dg.load_dygraph(file_path)
    if dtype == "float16":
        for k, v in model_dict.items():
            if "conv2d_transpose" in k:
                model_dict[k] = v.astype("float32")
            else:
                model_dict[k] = v.astype(dtype)
    model.set_dict(model_dict)
    print("[checkpoint] Rank {}: loaded model from {}".format(rank, file_path))
    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
            rank, file_path))


def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        iteration (int): the latest iteration number.
        model (obj): model to be checkpointed.
        optimizer (obj, optional): optimizer to be checkpointed.
            Defaults to None.

    Returns:
        None
    """
    file_path = "{}/step-{}".format(checkpoint_dir, iteration)
    model_dict = model.state_dict()
    dg.save_dygraph(model_dict, file_path)
    print("[checkpoint] Saved model to {}".format(file_path))

    if optimizer:
        opt_dict = optimizer.state_dict()
        dg.save_dygraph(opt_dict, file_path)
        print("[checkpoint] Saved optimzier state to {}".format(file_path))
@ -21,10 +21,11 @@ import paddle.fluid.dygraph as dg
from paddle import fluid
from scipy.io.wavfile import write

import utils
from parakeet.utils import io
from parakeet.modules import weight_norm
from .data import LJSpeech
from .waveflow_modules import WaveFlowLoss, WaveFlowModule
from parakeet.models.waveflow import WaveFlowLoss, WaveFlowModule
from data import LJSpeech
import utils


class WaveFlow():
@ -47,6 +48,7 @@ class WaveFlow():
    Returns:
        WaveFlow
    """

    def __init__(self,
                 config,
                 checkpoint_dir,
@ -91,13 +93,12 @@ class WaveFlow():
            parameter_list=waveflow.parameters())

        # Load parameters.
        utils.load_parameters(
            self.checkpoint_dir,
            self.rank,
            waveflow,
            optimizer,
        iteration = io.load_parameters(
            model=waveflow,
            optimizer=optimizer,
            checkpoint_dir=self.checkpoint_dir,
            iteration=config.iteration,
            file_path=config.checkpoint)
            checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Data parallelism.
@ -111,13 +112,11 @@ class WaveFlow():

        else:
            # Load parameters.
            utils.load_parameters(
                self.checkpoint_dir,
                self.rank,
                waveflow,
            iteration = io.load_parameters(
                model=waveflow,
                checkpoint_dir=self.checkpoint_dir,
                iteration=config.iteration,
                file_path=config.checkpoint,
                dtype=self.dtype)
                checkpoint_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

        for layer in waveflow.sublayers():
@ -126,6 +125,8 @@ class WaveFlow():

        self.waveflow = waveflow

        return iteration

    def train_step(self, iteration):
        """Train the model for one step.
@ -291,6 +292,5 @@ class WaveFlow():
        Returns:
            None
        """
        utils.save_latest_parameters(self.checkpoint_dir, iteration,
                                     self.waveflow, self.optimizer)
        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
        io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
                           self.optimizer)
@ -22,41 +22,67 @@ tar xjvf LJSpeech-1.1.tar.bz2
└── utils.py             utility functions
```

## Saving & Loading
`train.py` and `synthesis.py` have 3 arguments in common: `--checkpoint`, `--iteration` and `output`.

1. `output` is the directory for saving results.
During training, checkpoints are saved in `checkpoints/` in `output` and the tensorboard log is saved in `log/` in `output`. Other possible outputs are saved in `states/` in `output`.
During synthesizing, audio files and other possible outputs are saved in `synthesis/` in `output`.
So after training and synthesizing with the same output directory, the file structure of the output directory looks like this.

```text
├── checkpoints/      # checkpoint directory (including *.pdparams, *.pdopt and a text file `checkpoint` that records the latest checkpoint)
├── states/           # audio files generated at validation and other possible outputs
├── log/              # tensorboard log
└── synthesis/        # synthesized audio files and other possible outputs
```

2. `--checkpoint` and `--iteration` are for loading from an existing checkpoint. Loading an existing checkpoint follows these rules:
If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
If `--checkpoint` is not provided, we try to load the checkpoint specified by `--iteration` from the checkpoint directory. If `--iteration` is not provided either, we try to load the latest checkpoint from the checkpoint directory. A sketch of this last lookup follows.
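When neither flag is given, the latest checkpoint is resolved through a one-line text file named `checkpoint` inside the checkpoint directory. A minimal sketch of the lookup, following the `_load_latest_checkpoint` helper added in `parakeet/utils/io.py` at the end of this diff:

```python
import os

def latest_iteration(checkpoint_dir):
    # The record file holds a single line: "model_checkpoint_path: step-N".
    record = os.path.join(checkpoint_dir, "checkpoint")
    if not os.path.isfile(record):
        return 0  # nothing saved yet
    with open(record) as f:
        latest = f.readline().split()[-1]  # e.g. "step-500000"
    return int(latest.split("-")[-1])
```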
## Train

Train the model using train.py. For help on usage, try `python train.py --help`.

```text
usage: train.py [-h] [--data DATA] [--config CONFIG] [--output OUTPUT]
                [--device DEVICE] [--resume RESUME]
usage: train.py [-h] [--data DATA] [--config CONFIG] [--device DEVICE]
                [--checkpoint CHECKPOINT | --iteration ITERATION]
                output

Train a WaveNet model with LJSpeech.

positional arguments:
  output                path to save results

optional arguments:
  -h, --help            show this help message and exit
  --data DATA           path of the LJspeech dataset.
  --config CONFIG       path of the config file.
  --output OUTPUT       path to save results.
  --device DEVICE       device to use.
  --resume RESUME       checkpoint to resume from.
  -h, --help            show this help message and exit
  --data DATA           path of the LJspeech dataset
  --config CONFIG       path of the config file
  --device DEVICE       device to use
  --checkpoint CHECKPOINT
                        checkpoint to resume from
  --iteration ITERATION
                        the iteration of the checkpoint to load from output directory
```

- `--config` is the configuration file to use. The provided configurations can be used directly. You can also change some values in the configuration file and train the model with a different config.
- `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt).
- `--resume` is the path of the checkpoint. If it is provided, the model loads the checkpoint before training.
- `--output` is the directory to save results; all results are saved in this directory. The structure of the output directory is shown below.

```text
├── checkpoints      # checkpoint
└── log              # tensorboard log
```

- `--config` is the configuration file to use. The provided configurations can be used directly. You can also change some values in the configuration file and train the model with a different config.
- `--device` is the device (gpu id) to use for training. `-1` means CPU.
- `--checkpoint` is the path of the checkpoint.
- `--iteration` is the iteration of the checkpoint to load from the output directory.
- `output` is the directory to save results; all results are saved in this directory.

See [Saving-&-Loading](#Saving-&-Loading) for details of checkpoint loading.

Example script:

```bash
python train.py --config=./configs/wavenet_single_gaussian.yaml --data=./LJSpeech-1.1/ --output=experiment --device=0
python train.py \
    --config=./configs/wavenet_single_gaussian.yaml \
    --data=./LJSpeech-1.1/ \
    --device=0 \
    experiment
```

You can monitor the training log via TensorBoard, using the script below.
@ -69,29 +95,50 @@ tensorboard --logdir=.
## Synthesis
```text
usage: synthesis.py [-h] [--data DATA] [--config CONFIG] [--device DEVICE]
                    checkpoint output
                    [--checkpoint CHECKPOINT | --iteration ITERATION]
                    output

Synthesize valid data from LJspeech with a WaveNet model.
Synthesize valid data from LJspeech with a wavenet model.

positional arguments:
  checkpoint            checkpoint to load.
  output                path to save results.
  output                path to save the synthesized audio

optional arguments:
  -h, --help            show this help message and exit
  --data DATA           path of the LJspeech dataset.
  --config CONFIG       path of the config file.
  --device DEVICE       device to use.
  -h, --help            show this help message and exit
  --data DATA           path of the LJspeech dataset
  --config CONFIG       path of the config file
  --device DEVICE       device to use
  --checkpoint CHECKPOINT
                        checkpoint to resume from
  --iteration ITERATION
                        the iteration of the checkpoint to load from output directory
```

- `--data` is the path of the LJspeech dataset. In principle, a dataset is not needed for synthesis, but since the input is a mel spectrogram, we need to get the mel spectrogram from audio files.
- `--config` is the configuration file to use. You should use the same configuration with which you trained your model.
- `--data` is the path of the LJspeech dataset. A dataset is not needed for synthesis, but since the input is a mel spectrogram, we need to get the mel spectrogram from audio files.
- `checkpoint` is the checkpoint to load.
- `output_path` is the directory to save results. The output path contains the generated audio files (`*.wav`).
- `--device` is the device (gpu id) to use for training. `-1` means CPU.
- `--checkpoint` is the checkpoint to load.
- `--iteration` is the iteration of the checkpoint to load from the output directory.
- `output` is the directory to save the synthesized audio. Audio files are saved in `synthesis/` in the `output` directory.
See [Saving-&-Loading](#Saving-&-Loading) for details of checkpoint loading.

Example script:

```bash
python synthesis.py --config=./configs/wavenet_single_gaussian.yaml --data=./LJSpeech-1.1/ --device=0 experiment/checkpoints/step_500000 generated
python synthesis.py \
    --config=./configs/wavenet_single_gaussian.yaml \
    --data=./LJSpeech-1.1/ \
    --device=0 \
    --checkpoint="experiment/checkpoints/step-1000000" \
    experiment
```

or

```bash
python synthesis.py \
    --config=./configs/wavenet_single_gaussian.yaml \
    --data=./LJSpeech-1.1/ \
    --device=0 \
    --iteration=1000000 \
    experiment
```
@ -21,25 +21,35 @@ from tensorboardX import SummaryWriter
from paddle import fluid
import paddle.fluid.dygraph as dg

from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.data import SliceDataset, TransformDataset, DataCargo, SequentialSampler, RandomSampler
from parakeet.models.wavenet import UpsampleNet, WaveNet, ConditionalWavenet
from parakeet.utils.layer_tools import summary
from parakeet.utils import io

from data import LJSpeechMetaData, Transform, DataCollector
from utils import make_output_tree, valid_model, eval_model, save_checkpoint
from utils import make_output_tree, valid_model, eval_model

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Synthesize valid data from LJspeech with a wavenet model.")
    parser.add_argument(
        "--data", type=str, help="path of the LJspeech dataset.")
    parser.add_argument("--config", type=str, help="path of the config file.")
    parser.add_argument(
        "--device", type=int, default=-1, help="device to use.")
        "--data", type=str, help="path of the LJspeech dataset")
    parser.add_argument("--config", type=str, help="path of the config file")
    parser.add_argument("--device", type=int, default=-1, help="device to use")

    g = parser.add_mutually_exclusive_group()
    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
    g.add_argument(
        "--iteration",
        type=int,
        help="the iteration of the checkpoint to load from output directory")

    parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
    parser.add_argument(
        "output", type=str, default="experiment", help="path to save results.")
        "output",
        type=str,
        default="experiment",
        help="path to save the synthesized audio")

    args = parser.parse_args()
    with open(args.config, 'rt') as f:
@ -86,7 +96,8 @@ if __name__ == "__main__":
        batch_size=1,
        sampler=SequentialSampler(ljspeech_valid))

    make_output_tree(args.output)
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if args.device == -1:
        place = fluid.CPUPlace()
@ -110,9 +121,21 @@ if __name__ == "__main__":
    model = ConditionalWavenet(encoder, decoder)
    summary(model)

    model_dict, _ = dg.load_dygraph(args.checkpoint)
    print("Loading from {}.pdparams".format(args.checkpoint))
    model.set_dict(model_dict)
    # load model parameters
    checkpoint_dir = os.path.join(args.output, "checkpoints")
    if args.checkpoint:
        iteration = io.load_parameters(
            model, checkpoint_path=args.checkpoint)
    else:
        iteration = io.load_parameters(
            model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
    assert iteration > 0, "A trained model is needed."

    # WARNING: don't forget to remove weight norm to re-compute each wrapped layer's weight
    # removing weight norm also speeds up computation
    for layer in model.sublayers():
        if isinstance(layer, WeightNormWrapper):
            layer.remove_weight_norm()

    train_loader = fluid.io.DataLoader.from_generator(
        capacity=10, return_list=True)
@ -122,4 +145,8 @@ if __name__ == "__main__":
        capacity=10, return_list=True)
    valid_loader.set_batch_generator(valid_cargo, place)

    eval_model(model, valid_loader, args.output, sample_rate)
    synthesis_dir = os.path.join(args.output, "synthesis")
    if not os.path.exists(synthesis_dir):
        os.makedirs(synthesis_dir)

    eval_model(model, valid_loader, synthesis_dir, iteration, sample_rate)
@ -16,7 +16,7 @@ from __future__ import division
import os
import ruamel.yaml
import argparse
from tqdm import tqdm
import tqdm
from tensorboardX import SummaryWriter
from paddle import fluid
import paddle.fluid.dygraph as dg
@ -24,30 +24,37 @@ import paddle.fluid.dygraph as dg
from parakeet.data import SliceDataset, TransformDataset, DataCargo, SequentialSampler, RandomSampler
from parakeet.models.wavenet import UpsampleNet, WaveNet, ConditionalWavenet
from parakeet.utils.layer_tools import summary
from parakeet.utils import io

from data import LJSpeechMetaData, Transform, DataCollector
from utils import make_output_tree, valid_model, save_checkpoint
from utils import make_output_tree, valid_model

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train a wavenet model with LJSpeech.")
        description="Train a WaveNet model with LJSpeech.")
    parser.add_argument(
        "--data", type=str, help="path of the LJspeech dataset.")
    parser.add_argument("--config", type=str, help="path of the config file.")
        "--data", type=str, help="path of the LJspeech dataset")
    parser.add_argument("--config", type=str, help="path of the config file")
    parser.add_argument("--device", type=int, default=-1, help="device to use")

    g = parser.add_mutually_exclusive_group()
    g.add_argument("--checkpoint", type=str, help="checkpoint to resume from")
    g.add_argument(
        "--iteration",
        type=int,
        help="the iteration of the checkpoint to load from output directory")

    parser.add_argument(
        "--output",
        type=str,
        default="experiment",
        help="path to save results.")
    parser.add_argument(
        "--device", type=int, default=-1, help="device to use.")
    parser.add_argument(
        "--resume", type=str, help="checkpoint to resume from.")
        "output", type=str, default="experiment", help="path to save results")

    args = parser.parse_args()
    with open(args.config, 'rt') as f:
        config = ruamel.yaml.safe_load(f)

    print("Command Line Args: ")
    for k, v in vars(args).items():
        print("{}: {}".format(k, v))

    ljspeech_meta = LJSpeechMetaData(args.data)

    data_config = config["data"]
@ -126,14 +133,6 @@ if __name__ == "__main__":
    clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
        gradiant_max_norm)

    if args.resume:
        model_dict, optim_dict = dg.load_dygraph(args.resume)
        print("Loading from {}.pdparams".format(args.resume))
        model.set_dict(model_dict)
        if optim_dict:
            optim.set_dict(optim_dict)
            print("Loading from {}.pdopt".format(args.resume))

    train_loader = fluid.io.DataLoader.from_generator(
        capacity=10, return_list=True)
    train_loader.set_batch_generator(train_cargo, place)
@ -150,33 +149,48 @@ if __name__ == "__main__":
    log_dir = os.path.join(args.output, "log")
    writer = SummaryWriter(log_dir)

    global_step = 1
    # load parameters and optimizer, and update the iterations done so far
    if args.checkpoint is not None:
        iteration = io.load_parameters(
            model, optim, checkpoint_path=args.checkpoint)
    else:
        iteration = io.load_parameters(
            model,
            optim,
            checkpoint_dir=checkpoint_dir,
            iteration=args.iteration)

    global_step = iteration + 1
    iterator = iter(tqdm.tqdm(train_loader))
    while global_step <= max_iterations:
        epoch_loss = 0.
        for i, batch in tqdm(enumerate(train_loader)):
            audio_clips, mel_specs, audio_starts = batch
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm.tqdm(train_loader))
            batch = next(iterator)

            model.train()
            y_var = model(audio_clips, mel_specs, audio_starts)
            loss_var = model.loss(y_var, audio_clips)
            loss_var.backward()
            loss_np = loss_var.numpy()
        audio_clips, mel_specs, audio_starts = batch

            epoch_loss += loss_np[0]
        model.train()
        y_var = model(audio_clips, mel_specs, audio_starts)
        loss_var = model.loss(y_var, audio_clips)
        loss_var.backward()
        loss_np = loss_var.numpy()

            writer.add_scalar("loss", loss_np[0], global_step)
            writer.add_scalar("learning_rate",
                              optim._learning_rate.step().numpy()[0],
                              global_step)
            optim.minimize(loss_var, grad_clip=clipper)
            optim.clear_gradients()
            print("loss: {:<8.6f}".format(loss_np[0]))
        writer.add_scalar("loss", loss_np[0], global_step)
        writer.add_scalar("learning_rate",
                          optim._learning_rate.step().numpy()[0],
                          global_step)
        optim.minimize(loss_var, grad_clip=clipper)
        optim.clear_gradients()
        print("global_step: {}\tloss: {:<8.6f}".format(global_step,
                                                       loss_np[0]))

            if global_step % snap_interval == 0:
                valid_model(model, valid_loader, writer, global_step,
                            sample_rate)
        if global_step % snap_interval == 0:
            valid_model(model, valid_loader, writer, global_step,
                        sample_rate)

            if global_step % checkpoint_interval == 0:
                save_checkpoint(model, optim, checkpoint_dir, global_step)
        if global_step % checkpoint_interval == 0:
            io.save_parameters(checkpoint_dir, global_step, model, optim)

            global_step += 1
        global_step += 1
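The rewritten loop above drops the epoch-based `for` loop in favor of a single global-step loop that treats the data loader as an endlessly restartable iterator. Here is the pattern in isolation, as a sketch with a generic iterable standing in for the Paddle `DataLoader`:

```python
import tqdm

def endless_batches(loader):
    """Yield batches forever, re-creating the iterator on exhaustion."""
    iterator = iter(tqdm.tqdm(loader))
    while True:
        try:
            yield next(iterator)
        except StopIteration:
            iterator = iter(tqdm.tqdm(loader))

# Usage sketch: drive training purely by a global step counter.
# batches = endless_batches(train_loader)
# while global_step <= max_iterations:
#     audio_clips, mel_specs, audio_starts = next(batches)
#     ...one training step...
#     global_step += 1
```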
@ -49,20 +49,14 @@ def valid_model(model, valid_loader, writer, global_step, sample_rate):
                sample_rate)


def eval_model(model, valid_loader, output_dir, sample_rate):
def eval_model(model, valid_loader, output_dir, global_step, sample_rate):
    model.eval()
    for i, batch in enumerate(valid_loader):
        # print("sentence {}".format(i))
        path = os.path.join(output_dir, "sentence_{}.wav".format(i))
        path = os.path.join(output_dir,
                            "sentence_{}_step_{}.wav".format(i, global_step))
        audio_clips, mel_specs, audio_starts = batch
        wav_var = model.synthesis(mel_specs)
        wav_np = wav_var.numpy()[0]
        sf.write(path, wav_np, samplerate=sample_rate)
        print("generated {}".format(path))


def save_checkpoint(model, optim, checkpoint_dir, global_step):
    checkpoint_path = os.path.join(checkpoint_dir,
                                   "step_{:09d}".format(global_step))
    dg.save_dygraph(model.state_dict(), checkpoint_path)
    dg.save_dygraph(optim.state_dict(), checkpoint_path)
@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.models.waveflow.waveflow import WaveFlow
from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule
@ -313,6 +313,7 @@ class WaveNet(dg.Layer):
        """
        # Causal Conv
        if self.loss_type == "softmax":
            x = F.clip(x, min=-1., max=0.99999)
            x = quantize(x, self.output_dim)
            x = self.embed(x)  # (B, T, C), T=1
        else:
@ -86,7 +86,7 @@ class Conv1D(dg.Conv2D):
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=None,
                 groups=1,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,

@ -128,7 +128,7 @@ class Conv1DTranspose(dg.Conv2DTranspose):
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 groups=1,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
@ -179,7 +179,7 @@ class Conv1DCell(Conv1D):
                 filter_size,
                 dilation=1,
                 causal=False,
                 groups=None,
                 groups=1,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,

@ -225,6 +225,12 @@ class Conv1DCell(Conv1D):

    def start_sequence(self):
        """Prepare the Conv1DCell to generate a new sequence. This method should be called before calling add_input multiple times.

        WARNING:
            This method accesses `self.weight` directly. If a `Conv1DCell` object is wrapped in a `WeightNormWrapper`, make sure this method is called only after the `WeightNormWrapper`'s hook is called.
            `WeightNormWrapper` removes the wrapped layer's `weight` and adds `weight_v` and `weight_g`, which are used to re-compute the wrapped layer's weight as $weight = weight_g * weight_v / ||weight_v||$. (Recomputing the `weight` is a hook before calling the wrapped layer's `forward` method.)
            Whenever a `WeightNormWrapper`'s `forward` method is called, the wrapped layer's weight is updated. But when loading from a checkpoint, `weight_v` and `weight_g` are updated while the wrapped layer's weight is not, since it is no longer a `Parameter`. You should manually call `remove_weight_norm` or `hook` to re-compute the wrapped layer's weight before calling this method if you don't call `forward` first.
            So when loading a model which uses `Conv1DCell` objects wrapped in `WeightNormWrapper`s, remember to call `remove_weight_norm` for all `WeightNormWrapper`s before synthesizing. Also, removing weight norm speeds up computation.
        """
        if not self.causal:
            raise ValueError(
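This warning is exactly what the new `synthesis.py` acts on: after loading parameters it strips weight normalization from every wrapped sublayer before generating audio. A condensed sketch of that load-then-remove sequence (with `model` as a placeholder for the built network), mirroring the `synthesis.py` hunk earlier in this diff:

```python
from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.utils import io

# Load the checkpointed weight_v / weight_g parameters ...
iteration = io.load_parameters(
    model, checkpoint_path="experiment/checkpoints/step-1000000")

# ... then fold them back into each wrapped layer's weight once;
# this also speeds up synthesis.
for layer in model.sublayers():
    if isinstance(layer, WeightNormWrapper):
        layer.remove_weight_norm()
```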
@ -151,7 +151,7 @@ def Conv1D(num_channels,
           stride=1,
           padding=0,
           dilation=1,
           groups=None,
           groups=1,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,

@ -170,7 +170,7 @@ def Conv1DTranspose(num_channels,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    groups=1,
                    param_attr=None,
                    bias_attr=None,
                    use_cudnn=True,

@ -188,7 +188,7 @@ def Conv1DCell(num_channels,
               filter_size,
               dilation=1,
               causal=False,
               groups=None,
               groups=1,
               param_attr=None,
               bias_attr=None,
               use_cudnn=True,

@ -207,7 +207,7 @@ def Conv2D(num_channels,
           stride=1,
           padding=0,
           dilation=1,
           groups=None,
           groups=1,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,

@ -228,7 +228,7 @@ def Conv2DTranspose(num_channels,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    groups=1,
                    param_attr=None,
                    bias_attr=None,
                    use_cudnn=True,
@ -0,0 +1,170 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time

import ruamel.yaml
import numpy as np
import paddle.fluid.dygraph as dg
from paddle.fluid.framework import convert_np_dtype_to_dtype_ as convert_np_dtype


def is_main_process():
    local_rank = dg.parallel.Env().local_rank
    return local_rank == 0


def add_yaml_config_to_args(config):
    """Add args in the yaml config to the args parsed by argparse. An argument
    in the yaml config will be overwritten by the same argument in argparse if
    they are both valid.

    Args:
        config (args): the args returned by `argparse.ArgumentParser().parse_args()`

    Returns:
        config: the args with the yaml config added.
    """
    with open(config.config, 'rt') as f:
        yaml_cfg = ruamel.yaml.safe_load(f)
    cfg_vars = vars(config)
    for k, v in yaml_cfg.items():
        if k in cfg_vars and cfg_vars[k] is not None:
            continue
        cfg_vars[k] = v
    return config


def _load_latest_checkpoint(checkpoint_dir):
    """Get the iteration number corresponding to the latest saved checkpoint.

    Args:
        checkpoint_dir (str): the directory where checkpoints are saved.

    Returns:
        int: the latest iteration number.
    """
    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
    # Return 0 if the checkpoint index file does not exist yet.
    if (not os.path.isfile(checkpoint_record)):
        return 0

    # Fetch the latest checkpoint index.
    with open(checkpoint_record, "r") as handle:
        latest_checkpoint = handle.readline().split()[-1]
    iteration = int(latest_checkpoint.split("-")[-1])

    return iteration


def _save_checkpoint(checkpoint_dir, iteration):
    """Save the iteration number of the latest model to be checkpointed.

    Args:
        checkpoint_dir (str): the directory where checkpoints are saved.
        iteration (int): the latest iteration number.

    Returns:
        None
    """
    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
    # Update the latest checkpoint index.
    with open(checkpoint_record, "w") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))


def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,
                    iteration=None,
                    checkpoint_path=None):
    """Load a specific model checkpoint from disk.

    Args:
        model (obj): model to load parameters.
        optimizer (obj, optional): optimizer to load states if needed.
            Defaults to None.
        checkpoint_dir (str, optional): the directory where checkpoints are saved.
        iteration (int, optional): if specified, load the specific checkpoint;
            if not specified, load the latest one. Defaults to None.
        checkpoint_path (str, optional): if specified, load the checkpoint
            stored in checkpoint_path, and the argument 'checkpoint_dir' will
            be ignored. Defaults to None.

    Returns:
        iteration (int): number of iterations that the loaded checkpoint has
            been trained.
    """
    if checkpoint_path is not None:
        iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
    elif checkpoint_dir is not None:
        if iteration is None:
            iteration = _load_latest_checkpoint(checkpoint_dir)
        if iteration == 0:
            return iteration
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "step-{}".format(iteration))
    else:
        raise ValueError(
            "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
        )

    local_rank = dg.parallel.Env().local_rank
    model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)

    state_dict = model.state_dict()
    # Cast to the desired data type, for mixed-precision training/inference.
    for k, v in model_dict.items():
        if k in state_dict and convert_np_dtype(v.dtype) != state_dict[
                k].dtype:
            model_dict[k] = v.astype(state_dict[k].numpy().dtype)

    model.set_dict(model_dict)
    print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(
        local_rank, checkpoint_path))

    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}.pdopt".
              format(local_rank, checkpoint_path))

    return iteration


def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.

    Args:
        checkpoint_dir (str): the directory where checkpoints are saved.
        iteration (int): the latest iteration number.
        model (obj): model to be checkpointed.
        optimizer (obj, optional): optimizer to be checkpointed.
            Defaults to None.

    Returns:
        None
    """
    checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))
    model_dict = model.state_dict()
    dg.save_dygraph(model_dict, checkpoint_path)
    print("[checkpoint] Saved model to {}.pdparams".format(checkpoint_path))

    if optimizer:
        opt_dict = optimizer.state_dict()
        dg.save_dygraph(opt_dict, checkpoint_path)
        print("[checkpoint] Saved optimizer state to {}.pdopt".format(
            checkpoint_path))

    _save_checkpoint(checkpoint_dir, iteration)
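To round off the new module, a hedged end-to-end sketch of its save/resume cycle; `model` and `optim` are placeholders for any dygraph model and optimizer, and the checkpoint directory is assumed to already exist:

```python
from parakeet.utils import io

checkpoint_dir = "experiment/checkpoints"

# Save: writes step-1000.pdparams (plus step-1000.pdopt when an optimizer
# is given) and points the `checkpoint` record file at step 1000.
io.save_parameters(checkpoint_dir, 1000, model, optim)

# Resume: with no iteration given, the record file resolves to step 1000;
# the return value lets the caller continue from global_step = 1001.
iteration = io.load_parameters(model, optim, checkpoint_dir=checkpoint_dir)
assert iteration == 1000
```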