Compare commits
157 commits: release/v0...develop

Author | SHA1 | Date |
---|---|---|
TianYuan | 7334e1dcc5 | |
Hui Zhang | 035ab2c967 | |
TianYuan | 53365ac959 | |
TianYuan | ea8cec2b5e | |
TianYuan | b59fa0f861 | |
TianYuan | 50bcf7c261 | |
TianYuan | 4860d06dba | |
TianYuan | 5bd64d3869 | |
TianYuan | db4122f047 | |
TianYuan | a9f0f5c3c0 | |
TianYuan | 8e7c916443 | |
TianYuan | a5bc4ba7ec | |
chenfeiyu | 616c7f495e | |
TianYuan | def8218d33 | |
TianYuan | 065fa32a37 | |
chenfeiyu | 0dcb7f313f | |
TianYuan | 5bc570aee5 | |
TianYuan | 91f5b5bbdb | |
TianYuan | 22e5527c84 | |
TianYuan | 90dbb377b0 | |
TianYuan | 06cbdb1ebd | |
TianYuan | cf26e2932b | |
TianYuan | d1163daa70 | |
TianYuan | db34278d61 | |
TianYuan | cd28cba04d | |
TianYuan | 372208dd5b | |
TianYuan | 7b2fcbd4c9 | |
TianYuan | 360567ca20 | |
TianYuan | 4e19792ea2 | |
TianYuan | 1ed9cb0e5f | |
TianYuan | 1c853223ad | |
TianYuan | 3d10fec409 | |
TianYuan | 0fec449368 | |
TianYuan | 838d56ba6e | |
Hui Zhang | 070bf8b760 | |
Hui Zhang | c4615e3bba | |
TianYuan | 36604b4e41 | |
Feiyu Chan | b9a30eab9c | |
iclementine | b959b14409 | |
TianYuan | 106891f443 | |
iclementine | e3c024dd52 | |
iclementine | 2e9ffcb6d0 | |
Hui Zhang | 5232f59840 | |
TianYuan | 8a211abb70 | |
Hui Zhang | b4b9171250 | |
TianYuan | d88a448d3c | |
TianYuan | ad74b7a120 | |
TianYuan | 206452fcf0 | |
TianYuan | 437f6a2454 | |
TianYuan | 2482b112c0 | |
Hui Zhang | 70b6ce64e4 | |
TianYuan | cdd431e95b | |
Hui Zhang | 24c5b3c1a2 | |
TianYuan | c497fd843d | |
TianYuan | 30f344a6d0 | |
Hui Zhang | 82d5139f80 | |
TianYuan | 9ca5ce0128 | |
Hui Zhang | 48c65f4ab5 | |
TianYuan | 19631f4eab | |
TianYuan | e8991c973c | |
Hui Zhang | 97b7000aa2 | |
chenfeiyu | 7cb0b501a5 | |
chenfeiyu | e3f4923ed7 | |
chenfeiyu | f563b7de99 | |
chenfeiyu | 9425c779a0 | |
TianYuan | 309228ddbf | |
TianYuan | 796fafbac8 | |
Hui Zhang | 3ac2e01263 | |
TianYuan | a22b4dd171 | |
chenfeiyu | b452586fcf | |
Hui Zhang | 5e35a696e4 | |
TianYuan | 2eb899b0b7 | |
TianYuan | a141d39b38 | |
Hui Zhang | ffe65c89de | |
TianYuan | 6aeb56301f | |
chenfeiyu | 9e0050b927 | |
chenfeiyu | f71d599476 | |
Feiyu Chan | 093c2e53f5 | |
chenfeiyu | d05ee1d7d5 | |
TianYuan | 3d39385d5e | |
chenfeiyu | 133294340c | |
TianYuan | 47ec051136 | |
Hui Zhang | 25788ab2ca | |
TianYuan | 474bc4c06a | |
chenfeiyu | 4ba8e7e342 | |
chenfeiyu | 51397f8500 | |
TianYuan | 6553d1d723 | |
chenfeiyu | 0dec9221bb | |
chenfeiyu | a62eeb9b06 | |
chenfeiyu | acc02c9b79 | |
chenfeiyu | 8b7dabbd8d | |
chenfeiyu | 4a7888b8c6 | |
TianYuan | 3af3c29a94 | |
chenfeiyu | 6c21d80025 | |
Feiyu Chan | 124dedbd7b | |
chenfeiyu | 7522c3eaba | |
Feiyu Chan | dd7f2a6d2e | |
chenfeiyu | 26b4cf153d | |
chenfeiyu | fa0d7935d1 | |
Feiyu Chan | 9ffdd7abd8 | |
Feiyu Chan | 109195a5c0 | |
Feiyu Chan | 0b5eb96d7e | |
chenfeiyu | 47a9ab3a0b | |
chenfeiyu | 96b8e44015 | |
Hui Zhang | 68e9a84ada | |
chenfeiyu | e41423caf0 | |
chenfeiyu | 3ebed00c96 | |
chenfeiyu | e6554abe05 | |
chenfeiyu | afe9d4a4f1 | |
chenfeiyu | dd6772bc3e | |
chenfeiyu | a93fad051c | |
chenfeiyu | af26c1e389 | |
chenfeiyu | ef51e1ab13 | |
Feiyu Chan | de61534153 | |
chenfeiyu | 29b8b8b0ea | |
iclementine | 61c13dd69b | |
iclementine | f9105db727 | |
chenfeiyu | 3e8a156348 | |
chenfeiyu | 577c3b4f10 | |
chenfeiyu | 7e049a7744 | |
chenfeiyu | 542bbf6a81 | |
liangyunming | ea5cb8e71f | |
chenfeiyu | b5f99a925f | |
chenfeiyu | 83c9f0aeae | |
chenfeiyu | a738954001 | |
chenfeiyu | 3977632b07 | |
chenfeiyu | 7ac0d3ce12 | |
chenfeiyu | fbc7e51fc9 | |
chenfeiyu | 30045cf602 | |
chenfeiyu | 58a988c789 | |
chenfeiyu | 27d3585606 | |
chenfeiyu | 683cc1d30f | |
chenfeiyu | 042e02d242 | |
chenfeiyu | 8dbcc9bccb | |
Hui Zhang | d96e2828b8 | |
chenfeiyu | bbbe5a8b50 | |
chenfeiyu | 95f64c4f02 | |
chenfeiyu | b0983e4d76 | |
chenfeiyu | 54c7905f40 | |
chenfeiyu | 0067851950 | |
chenfeiyu | 66062d29e5 | |
chenfeiyu | 258083aea9 | |
chenfeiyu | f0a5ac8c5a | |
chenfeiyu | 13ab0bd608 | |
chenfeiyu | 3bf2e71734 | |
chenfeiyu | 60c16dcfb7 | |
chenfeiyu | 988d6d3268 | |
chenfeiyu | 8e31783b51 | |
chenfeiyu | 6a8b3f92df | |
chenfeiyu | 3c964fde54 | |
chenfeiyu | 759999c738 | |
chenfeiyu | c306f5c2b3 | |
chenfeiyu | dc9040dd4d | |
chenfeiyu | 0114a808a2 | |
chenfeiyu | 13323bdf6a | |
chenfeiyu | 37a66f1506 | |
chenfeiyu | b571b506c3 | |

.clang-format

@@ -0,0 +1,28 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# You need to install llvm and clang to format source code with it.
#
# The basic usage is:
#   clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in
# one of the parent directories.
# The -i flag means in-place change.
#
# The documentation of clang-format is available at
#   http://clang.llvm.org/docs/ClangFormat.html
#   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2  # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
.flake8

@@ -0,0 +1,54 @@
[flake8]

########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120


################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
    # git folder
    .git,
    # python cache
    __pycache__,
    third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
    *.py


########## RULES ##########

# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F   - linting errors (pyflakes)
# C   - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator

# Specify a list of codes to ignore.
ignore =
    W503
    E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
    W291,W293,W605
    E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,
    # these ignores are from flake8-comprehensions; please fix!
    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415


per-file-ignores =
    */__init__.py: F401

# Specify the list of error codes you wish Flake8 to report.
select =
    E,
    W,
    F,
    C
.gitignore

@@ -142,3 +142,5 @@ dmypy.json
*.swp
runs
syn_audios
exp/
dump/
@ -1,8 +1,9 @@
|
|||
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
|
||||
sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
|
||||
- repo: https://github.com/pre-commit/mirrors-yapf.git
|
||||
sha: v0.16.0
|
||||
hooks:
|
||||
- id: yapf
|
||||
files: \.py$
|
||||
exclude: (?=third_party).*(\.py)$
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
sha: a11d9314b22d8f8c7556443875b731ef05965464
|
||||
hooks:
|
||||
|
@ -14,7 +15,22 @@
|
|||
files: \.md$
|
||||
- id: trailing-whitespace
|
||||
files: \.md$
|
||||
- repo: https://github.com/Lucas-C/pre-commit-hooks
|
||||
- id: requirements-txt-fixer
|
||||
exclude: (?=third_party).*$
|
||||
- id: check-yaml
|
||||
- id: check-json
|
||||
- id: pretty-format-json
|
||||
args:
|
||||
- --no-sort-keys
|
||||
- --autofix
|
||||
- id: check-merge-conflict
|
||||
- id: flake8
|
||||
aergs:
|
||||
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
|
||||
- --builtins=G,request
|
||||
- --jobs=1
|
||||
exclude: (?=third_party).*(\.py)$
|
||||
- repo : https://github.com/Lucas-C/pre-commit-hooks
|
||||
sha: v1.0.1
|
||||
hooks:
|
||||
- id: forbid-crlf
|
||||
|
@ -27,9 +43,15 @@
|
|||
files: \.md$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: clang-format
|
||||
name: clang-format
|
||||
description: Format files with ClangFormat
|
||||
entry: bash .pre-commit-hooks/clang-format.hook -i
|
||||
language: system
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
|
||||
- id: copyright_checker
|
||||
name: copyright_checker
|
||||
entry: python ./tools/copyright.hook
|
||||
entry: python .pre-commit-hooks/copyright-check.hook
|
||||
language: system
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
|
||||
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
|
||||
exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
|
||||
|
|
|
.pre-commit-hooks/clang-format.hook

@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -e

readonly VERSION="3.9"

version=$(clang-format -version)

# if ! [[ $version == *"$VERSION"* ]]; then
#     echo "clang-format version check failed."
#     echo "a version contains '$VERSION' is needed, but get '$version'"
#     echo "you can install the right version, and make an soft-link to '\$PATH' env"
#     exit -1
# fi

clang-format $@
.pre-commit-hooks/copyright-check.hook

@@ -0,0 +1,133 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import io
import os
import re
import sys
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)


def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1:
            continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"


def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)


PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper():
            continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())
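To see what this hook produces, you can exercise `generate_copyright` directly. A minimal sketch, assuming the hook file has been copied to an importable module named `copyright_check.py` (the module name is an assumption for illustration only):

```python
# Hypothetical demo: render the header that the hook above would insert.
# Assumes the hook has been saved as copyright_check.py so it can be imported.
from copyright_check import COPYRIGHT, generate_copyright

# The Apache-2.0 header rendered as Python comments ("#") ...
print(generate_copyright(COPYRIGHT, lang='Python'))
# ... and as C-style comments ("//") for C/C++/CUDA/proto sources.
print(generate_copyright(COPYRIGHT, lang='C'))
```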
.style.yapf

@@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80
README.md

@@ -1,44 +1,49 @@
# Parakeet

Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on the PaddlePaddle Fluid dynamic graph and includes many influential TTS models proposed by [Baidu Research](http://research.baidu.com) and other research groups.
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on the PaddlePaddle dynamic graph and includes many influential TTS models.

<div align="center">
<img src="images/logo.png" width=300 /> <br>
<img src="docs/images/logo.png" width=300 /> <br>
</div>

In particular, it features the latest [WaveFlow](https://arxiv.org/abs/1912.01219) model proposed by Baidu Research.
## News <img src="./docs/images/news_icon.png" width="40"/>

- WaveFlow can synthesize 22.05 kHz high-fidelity speech around 40x faster than real-time on an Nvidia V100 GPU without engineered inference kernels, which is faster than [WaveGlow](https://github.com/NVIDIA/waveglow) and several orders of magnitude faster than WaveNet.
- WaveFlow is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smaller than WaveGlow (87.9M).
- WaveFlow is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in Parallel WaveNet and ClariNet, which simplifies the training pipeline and reduces the cost of development.
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2 with AISHELL-3. Check [fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-3-2021, FastSpeech2 with CSMSC. Check [fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [parallelwave_gan/baker](./examples/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, voice cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

## Overview

In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here include Vocoders and end-to-end TTS models:
In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here include a Text Frontend, end-to-end Acoustic models and Vocoders:

* Text FrontEnd
  * Rule based frontend.

- Acoustic Models
  - [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
  - [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
  - [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
  - [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
- Vocoders
  - [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)

- TTS models
  - [Neural Speech Synthesis with Transformer Network (Transformer TTS)](https://arxiv.org/abs/1809.08895)
  - [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)

## Updates

May-07-2021, Add an example for voice cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

  - [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
  - [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- Voice Cloning
  - [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
  - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)

## Setup
Some of this repo's dependent libraries are difficult to install on Windows, so we recommend that you **DO NOT** use Windows; please use Linux.

Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:

```bash
sudo apt-get install libsndfile1
```

### Install PaddlePaddle

See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.0.0rc1** or above.
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.

### Install Parakeet
```bash
@@ -52,44 +57,72 @@ cd Parakeet
pip install -e .
```

If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```

See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.

## Examples

Entry points to the introduction, training, and synthesis of the different example models:

- [>>> WaveFlow](./examples/waveflow)
- [>>> Transformer TTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)
- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)

- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)

## Audio samples

### TTS models (Acoustic Model + Neural Vocoder)

Check our [website](https://paddle-parakeet.readthedocs.io/en/latest/demo.html) for audio samples.

## Released Model

## Checkpoints
### AM

#### FastSpeech2
1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)

#### SpeedySpeech
1. [speedyspeech_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_baker_ckpt_0.4.zip)

#### TransformerTTS

1. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip)

#### Tacotron2

### Tacotron2
1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)

### Tacotron2_AISHELL3
1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
### Vocoder

### TransformerTTS
1. [transformer_tts_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.3.zip)
#### WaveFlow

### WaveFlow
1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)

### GE2E
#### Parallel WaveGAN

1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)

### Voice Cloning

#### Tacotron2_AISHELL3

1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

#### GE2E

1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)

## Copyright and License
## License

Parakeet is provided under the [Apache-2.0 license](LICENSE).
[Image diffs: three existing images modified with unchanged sizes (19 KiB, 33 KiB, 75 KiB), and one new image added (116 KiB).]
conf.py

@@ -11,15 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
config_cn.md

@@ -1,104 +0,0 @@
# Experiment Configuration

This section describes the recommended way to configure experiments in Parakeet, and the reasons behind these choices.

## What gets configured

Deep learning experiments usually have many configurable options. These can roughly be grouped into several categories:

1. data source and data handling configuration;
2. paths for saving experiment results;
3. data preprocessing configuration;
4. model architecture and hyperparameters;
5. training process configuration.

These categories may overlap; for example, the number of mel spectrogram bins can be seen both as part of the model configuration and as part of the data processing configuration. On the whole, though, a configuration file can be divided into these parts.

## Common configuration file formats

Common configuration file formats include `ini`, `yaml`, `toml` and `json`.

`ini`
Pros: simple; supports string interpolation and similar operations.
Cons: only two levels of structure; values carry no type information, so they must be cast manually when parsed.

`yaml`
Pros: concise format; values are typed, so manual casting is usually unnecessary; supports comments.
Cons: the full syntax specification is complex.

`toml`
Similar to yaml.

`json`
Pros: simple format.
Cons: noisy markup and poor readability; error-prone to write by hand; no comments.

For the expressiveness and readability of the language itself we chose yaml, but we keep configuration files as simple as possible:

1. only strings, integers, floats and booleans as value types;
2. nesting limited to two levels or fewer.

## Configuration options vs. command-line arguments

In a deep learning experiment, some settings change frequently, such as the data source, the path for saving results, or the path of a checkpoint to load. These settings are better implemented as command-line arguments.

The remaining, rarely-changing options are best written in a configuration file. We recommend `yaml` for this, because it allows comments and is more human-readable.

Handling every option with argparse would also work, but for option-rich deep learning experiments it makes the code unreasonably verbose.

Note, however, that when a configuration file and a command-line parser are used together, the options supported by the configuration file do not appear in the usage and help messages of the `argparse.ArgumentParser` unless special measures are taken. This is mostly due to inherent design differences between configuration file parsing and argparse.

Attaching the configuration's options to the ArgumentParser could compensate for this, but it raises the question of which defaults take precedence: those of the default configuration or those in the ArgumentParser.

We therefore chose not to attach the configuration's options to the ArgumentParser and instead handle the two parts separately.

## In practice

We use yacs together with argparse as our configuration tooling, adding a `--config` option to the command line to pass in a configuration file. yacs has several useful properties:

1. it supports yaml configuration files (i.e., nested structure and typed values);
2. it supports incremental overrides of a config, including overriding a configuration file with command-line arguments, and other flexible operations;
3. it supports recursive attribute access via `.key`, which is more convenient than dictionary-style `["key"]`.

We recommend writing the default configuration as Python code (each example in examples/ has a config.py providing the default configuration, with comments). A user who wants to override part of the configuration then only needs to provide the part to override, not a complete configuration file. The reasoning:

1. providing only the overridden options is the standard way many software systems are configured;
2. two runs of the same model usually differ in only a few options, so an incremental configuration makes the difference between the two runs much easier to see than two complete files;
3. a script can be run without the `--config` argument at all, using the default configuration, which simplifies running scripts.

When adding a new experiment, refer to the examples in examples/ for how to write the default configuration.

Besides specifying an override file via the `--config` command-line argument, we also add an `--opts` option that receives the remaining command-line arguments parsed by the ArgumentParser. These are used to further override the configuration. The usage is `--opts key1 value1 key2 value2 ...`, i.e., keys and values separated by spaces, for example `--opts training.lr 0.001 model.encoder_layers 4`. The keys are configuration key names; nested keys are joined with `.`.
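A minimal sketch of this pattern, assuming only that yacs is installed (the option names below are illustrative, not Parakeet's actual defaults):

```python
import argparse

from yacs.config import CfgNode

# Default configuration written as Python code.
_C = CfgNode()
_C.training = CfgNode()
_C.training.lr = 0.001             # illustrative default
_C.model = CfgNode()
_C.model.encoder_layers = 6        # illustrative default

parser = argparse.ArgumentParser()
parser.add_argument("--config", metavar="FILE",
                    help="partial config file to overwrite the default config with")
parser.add_argument("--opts", nargs=argparse.REMAINDER, default=[],
                    help="KEY VALUE pairs, e.g. --opts training.lr 0.01")
args = parser.parse_args()

config = _C.clone()
if args.config:
    config.merge_from_file(args.config)   # incremental override from a yaml file
config.merge_from_list(args.opts)         # further override from the command line
print(config.training.lr)                 # attribute-style access
```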
## The default ArgumentParser

We provide a default ArgumentParser (see `parakeet/training/cli.py`) that implements the behavior described above. It contains a minimal set of command-line options: only `--config`, `--data`, `--output`, `--checkpoint_path`, `--device`, `--nprocs` and `--opts`.

These are the command-line options that almost every deep learning experiment needs, so a new experiment can use this ArgumentParser directly and extend it when options beyond this set are required.

1. `--config` and `--opts` support configuration file parsing, while the configuration file itself handles the options specific to each experiment;
2. `--data` and `--output` are the dataset path and the path for saving training results (containing the checkpoints/ folder, text output and visualization output), respectively;
3. `--checkpoint_path` loads a checkpoint before training, for continuing from a specific checkpoint. If `--checkpoint_path` is not given and the checkpoints/ folder under `--output` already contains training results, the latest checkpoint is loaded by default to continue training;
4. `--device` and `--nprocs` specify how to run: `--device` selects the device type, i.e. whether to run on cpu or gpu, and `--nprocs` is the number of training processes; `nprocs` > 1 means multi-process parallel training. (Note: currently only multi-GPU multi-process training is supported.)

The help message is as follows:

```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
                [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
                [--nprocs NPROCS] [--opts ...]

optional arguments:
  -h, --help            show this help message and exit
  --config FILE         path of the config file to overwrite to default config
                        with.
  --data DATA_DIR       path to the datatset.
  --output OUTPUT_DIR   path to save checkpoint and log. If not provided, a
                        directory is created in runs/ to save outputs.
  --checkpoint_path CHECKPOINT_PATH
                        path of the checkpoint to load
  --device {cpu,gpu}    device type to use, cpu and gpu are supported.
  --nprocs NPROCS       number of parallel processes to use.
  --opts ...            options to overwrite --config file and the default
                        config, passing in KEY VALUE pairs
```
data_cn.md

@@ -1,216 +0,0 @@
# Data Preparation

This section describes the design of the `parakeet.data` submodule and how to use it in experiments.

`parakeet.data` follows the data preparation pipeline conventional in paddle: Dataset, Sampler, batch function, DataLoader.

## Dataset

We assume a dataset is a list of examples. You can get its length via the `__len__` method and access its elements randomly via the `__getitem__` method. With these two conditions met, `iter(dataset)` also yields an iterator over the dataset. We usually create our own datasets by subclassing `paddle.io.Dataset` and implementing `__len__` and `__getitem__` for it.

Considering preprocessing cost, loading cost and dataset size, several strategies control whether a dataset is preprocessed lazily, loaded lazily, or kept resident in memory:

1. All data is preprocessed when the dataset is instantiated and kept resident in memory. This suits datasets that are fast to preprocess and small overall: all preprocessing happens at instantiation, so slow preprocessing would mean a long wait for instantiation, and since the processed data stays in memory, the whole dataset must fit in memory.
2. Each example is preprocessed when requested, and the result is cached. This can be implemented by calling the per-example preprocessing method inside the dataset's `__getitem__`. The condition is likewise that the data fits in memory, but the benefit is that you do not have to wait long at instantiation. With this strategy, access becomes noticeably faster after one full pass over the dataset, because nothing needs reprocessing; on first use examples are still processed on the fly, so to measure steady-state iteration speed you have to iterate over the dataset once first.
3. Preprocess the whole dataset first and save the results, then use them as another dataset whose `__getitem__` merely reads from storage. Data reading rarely constrains model training, and this approach does not require the whole dataset to fit in memory, so it is quite flexible; the cost is a separate preprocessing script and a dataset class written for the processed data.

These three strategies are only a conceptual division; in practice we may mix them. For example:

1. Among the fields of one example, small ones such as text may stay resident in memory, while audio, spectrograms or images may be preprocessed and stored, loading only the processed results on access.
2. For datasets that are large or slow to preprocess, we can load only a small set of metadata containing features usable for sorting or filtering examples, so the data can be sorted or filtered without loading whole examples.

In general, we view a Dataset subclass as an adapter between a dataset and the concrete needs of an experiment.

Parakeet also provides several higher-order Dataset classes that derive new Datasets from existing ones:

1. TupleDataset and DictDataset for combining fields;
2. SliceDataset, SubsetDataset and ChainDataset for splitting and concatenating datasets;
3. CacheDataset for caching;
4. FilterDataset for filtering;
5. TransformDataset for transforming.

These higher-order datasets can be used flexibly to make data handling more flexible.

## DataLoader

Like a `Dataset`, a `DataLoader` is an iterable, but it normally iterates in batches. In deep learning we need a `DataLoader` because combining several examples into one batch makes full use of the compute resources of modern hardware. A DataLoader is built from a Dataset and can be iterated multiple times.

Building a DataLoader requires two more ingredients besides a Dataset:

1. how to form a batch;
2. how to pick the examples that form a batch.

The next two subsections provide these two ingredients in turn.

### Batch function

A batch is the result of some transformation of a list of examples. Suppose an example is a structure with several fields (implemented differently across languages: a tuple or dict in Python, perhaps a struct in C/C++). Then a list of examples is an array of structures (AOS). For training neural networks, however, we want a batch to be, like a single example, a structure with several fields; so we need a way to turn an array of structures into a structure of arrays (SOA).

Here is a simple example.

The table below represents two examples, each with 5 fields.

| weight | height | width | depth | density |
| ------ | ------ | ----- | ----- | ------- |
| 1.2 | 1.1 | 1.3 | 1.4 | 0.8 |
| 1.6 | 1.4 | 1.2 | 0.6 | 1.4 |

The AOS and SOA representations of the table above are:

AOS:

```text
[(1.2, 1.1, 1.3, 1.4, 0.8),
 (1.6, 1.4, 1.2, 0.6, 1.4)]
```

SOA:

```text
([1.2, 1.6],
 [1.1, 1.4],
 [1.3, 1.2],
 [1.4, 0.6],
 [0.8, 1.4])
```

For this example, converting AOS to SOA is trivial: just stack each field across all examples. But things are not always so simple. When a field contains a sequence, you may need to pad all the sequences to the longest length before you can stack them. In some cases a batch may have more fields than an example: for examples containing sequences, after padding, an extra field may be needed to record the valid lengths of those fields. So in general a function is needed to do this, and it goes together with the dataset. Any callable object will do besides a plain function; we call these batch functions.
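As a concrete sketch, a padding batch function for integer id sequences might look like this (plain numpy; `parakeet.data` provides its own batching helpers such as `batch_text_id`, used in the example code below, whose details may differ):

```python
import numpy as np


def batch_text_id(examples, pad_id=0):
    """Pad a list of 1-D int arrays to the same length and stack them.

    A sketch of a padding batch function; a real implementation might also
    return the valid lengths as an extra field of the batch.
    """
    max_len = max(len(x) for x in examples)
    batch = np.full((len(examples), max_len), pad_id, dtype=examples[0].dtype)
    for i, x in enumerate(examples):
        batch[i, :len(x)] = x   # copy the valid part; the rest stays pad_id
    return batch


# Two "text id" sequences of different lengths -> one (2, 3) array.
batch = batch_text_id([np.array([1, 2, 3]), np.array([4, 5])])
```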
### Sampler

With a batch function (we know how to form a batch), the next question is what to put into a batch. When assembling a batch we need to decide which examples to pick. Since we assume the dataset supports random access, we only need to pick the corresponding indices; a sampler performs this index selection.

A Sampler is implemented as an iterable of integers. If the dataset has `N` examples, any iterator producing integers in `[0, N)` is a suitable sampler. The most commonly used samplers are `SequentialSampler` and `RandomSampler`.

When iterating a DataLoader, the sampler first produces several indices, the corresponding examples are fetched, and the batch function is called to combine them into a batch. Fetching the examples can be parallelized, but the batch function call that forms the batch cannot.

Another option is a batch sampler, an iterable of lists of integers. With a regular sampler, several `next` calls on its iterator are needed to produce several indices, whereas one `next` call on a batch sampler's iterator yields several indices at once. With a regular sampler the batch size is determined by the DataLoader; with a batch sampler, the batch sampler determines the DataLoader's batch size, so it can implement special requirements such as dynamic batch sizes.
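For illustration, minimal sequential and random samplers might look like the sketch below (`paddle.io` ships real `SequentialSampler` and `RandomSampler` classes; this only shows the protocol):

```python
import random


class SequentialSampler:
    """Yield indices 0..N-1 in order."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __iter__(self):
        return iter(range(len(self.dataset)))


class RandomSampler:
    """Yield a fresh random permutation of 0..N-1 on each iteration."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __iter__(self):
        indices = list(range(len(self.dataset)))
        random.shuffle(indices)
        return iter(indices)
```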
## Example code

The following is the code we use to process the `LJSpeech` dataset with `parakeet.data`.

First we define a class to represent the LJSpeech dataset. It simply loads the metadata as-is, i.e. the dataset's `metadata.csv`, which records the audio file names and the transcriptions; it neither loads audio nor does any preprocessing. We deliberately keep this dataset simple: it needs only the dataset path to be instantiated.

```python
import csv
import numpy as np
import librosa
from pathlib import Path
from paddle.io import Dataset

from parakeet.data import batch_text_id, batch_spec, batch_wav


class LJSpeechMetaData(Dataset):
    def __init__(self, root):
        self.root = Path(root).expanduser()
        wav_dir = self.root / "wavs"
        csv_path = self.root / "metadata.csv"
        records = []
        speaker_name = "ljspeech"
        with open(str(csv_path), 'rt') as f:
            for line in f:
                filename, _, normalized_text = line.strip().split("|")
                filename = str(wav_dir / (filename + ".wav"))
                records.append([filename, normalized_text, speaker_name])
        self.records = records

    def __getitem__(self, i):
        return self.records[i]

    def __len__(self):
        return len(self.records)
```

Then we define a `Transform` class that processes examples from `LJSpeechMetaData` into the data the model needs. Different Transforms can be defined for different models, so the `LJSpeechMetaData` code can be shared.

```python
import numpy as np

from parakeet.audio import AudioProcessor
from parakeet.audio import LogMagnitude
from parakeet.frontend import English


class Transform(object):
    def __init__(self):
        self.frontend = English()
        self.processor = AudioProcessor(
            sample_rate=22050,
            n_fft=1024,
            win_length=1024,
            hop_length=256,
            f_max=8000)
        self.normalizer = LogMagnitude()

    def __call__(self, record):
        fname, text, _ = record
        wav = self.processor.read_wav(fname)
        mel = self.processor.mel_spectrogram(wav)
        mel = self.normalizer.transform(mel)
        phonemes = self.frontend.phoneticize(text)
        ids = self.frontend.numericalize(phonemes)
        stop_probs = np.ones([mel.shape[1]], dtype=np.int64)
        stop_probs[-1] = 2
        return (ids, mel, stop_probs)
```

`Transform` loads the audio and extracts the spectrogram. Implementing `Transform` as a callable class makes it convenient to hold many options, such as the parameters of the Fourier transform. An `LJSpeechMetaData` object and a `Transform` object can then be combined to create a `TransformDataset`:

```python
from parakeet.data import TransformDataset

meta = LJSpeechMetaData(data_path)
transform = Transform()
ljspeech = TransformDataset(meta, transform)
```

Of course you can also write a dedicated conversion script to save the transformed dataset, plus a matching Dataset subclass that loads the saved data; in practice this is actually more efficient.

Next we need a callable object that combines several examples into a batch. Because the ids and mel spectrograms are sequence data, we need padding:

```python
class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_idx=0, padding_value=0.):
        self.padding_idx = padding_idx
        self.padding_value = padding_value

    def __call__(self, examples):
        ids = [example[0] for example in examples]
        mels = [example[1] for example in examples]
        stop_probs = [example[2] for example in examples]

        ids = batch_text_id(ids, pad_id=self.padding_idx)
        mels = batch_spec(mels, pad_value=self.padding_value)
        stop_probs = batch_text_id(stop_probs, pad_id=self.padding_idx)
        return ids, np.transpose(mels, [0, 2, 1]), stop_probs
```

With the components above ready, we can assemble the whole data pipeline:

```python
from paddle.io import DataLoader
from parakeet.data import dataset


def create_dataloader(source_path, valid_size, batch_size):
    lj = LJSpeechMetaData(source_path)
    transform = Transform()
    lj = TransformDataset(lj, transform)

    valid_set, train_set = dataset.split(lj, valid_size)
    train_loader = DataLoader(
        train_set,
        return_list=False,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        collate_fn=LJSpeechCollector())
    valid_loader = DataLoader(
        valid_set,
        return_list=False,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=LJSpeechCollector())
    return train_loader, valid_loader
```

train_loader and valid_loader can both be iterated. Calling next on their iterators returns a list of `paddle.Tensor` representing one batch, which can be used directly as the input of a `paddle.nn.Layer`.
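For instance, a minimal sketch of using the pipeline above (the dataset path and sizes are placeholders):

```python
# Assemble the loaders and fetch one batch; the path is illustrative.
train_loader, valid_loader = create_dataloader(
    "~/datasets/LJSpeech-1.1", valid_size=64, batch_size=16)
ids, mels, stop_probs = next(iter(train_loader))  # one batch of paddle Tensors
```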
experiment_cn.md

@@ -1,75 +0,0 @@
# Experiment Workflow

There are many details to take care of in an experiment, such as saving and loading models, periodic validation, text logs and visualization logs, saving configuration files, and extra handling for different run modes. This code can be tedious, but it is very important for tracking how code changes affect results and for debugging. To reduce the cost of writing it, we provide a good deal of general-purpose helper code, e.g. for saving and loading and for visualization, which experiment code can use directly.

For the experiment process as a whole, we provide an ExperimentBase class, a training-process template abstracted out during model and experiment development, usable as a base class for concrete experiments. Compared with the Trainer in chainer or Model.fit in keras, ExperimentBase is a relatively low-level API. It is meant to be used as a base class: the user still implements the whole training process and can therefore freely control many things. It is not meant to be used compositionally, where the user only supplies the model, dataset and metrics and the whole training process completes automatically.

The former approach does not save much code; it only organizes code in a standardized way. The latter approach saves a lot of code but hides how the training process is assembled. If custom behavior must be added to such a standard training process, it has to be done via extensions/hooks that insert custom behavior at fixed points (e.g. the start or end of an iteration, of an epoch, or of the whole run).

Adding custom behavior to the training process via extensions/hooks often comes with access restrictions. An extension/hook is usually implemented as a callable, but the variables it can access are often limited, e.g. only model, optimizer, dataloader, iteration, epoch and metrics; accessing other intermediate variables tends to be awkward.

Moreover, the compositional style usually presupposes protocols for how the components exchange data. A common presupposition is that the batch produced by the dataloader is exactly the model's input. In simple cases this is mostly fine, but a model may need inputs beyond the batch. Another common presupposition is that the criterion needs only the model's input and output to compute the loss, which can be overkill: in some cases not all fields of the input and output are needed to compute the loss, and designing the criterion interface uniformly to satisfy the protocol means passing unnecessary arguments.

## The design of ExperimentBase

We therefore chose a low-level interface: the user can still freely drive the training process, and only a coarse-grained abstraction of it is imposed. See the code of [ExperimentBase](parakeet/training/experiment.py).

When subclassing ExperimentBase to write your own experiment class, follow these conventions (a minimal sketch follows the list):

1. it holds attributes such as `.model`, `.optimizer`, `.train_loader`, `.valid_loader`, `.config` and `.args`;
2. the configuration contains a `.training` field with the keys `valid_interval`, `save_interval` and `max_iteration`, which are used as the conditions for triggering validation, saving checkpoints, and stopping training;
3. four methods are implemented: `train_batch`, `valid`, `setup_model` and `setup_dataloader`. `train_batch` is the processing of one batch, `valid` runs one pass of validation over the whole validation set, `setup_model` initializes the model and optimizer (other model-building code can also live there), and `setup_dataloader` builds train_loader and valid_loader.
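Under these conventions, a skeleton subclass might look like the following (a sketch only: the toy linear model and the config keys are illustrative, and `create_dataloader` refers to the helper built in the data preparation docs):

```python
import paddle
from paddle import nn
from parakeet.training.experiment import ExperimentBase


class MyExperiment(ExperimentBase):
    """A sketch of the four required methods."""

    def setup_model(self):
        self.model = nn.Linear(80, 80)  # toy stand-in for a real TTS model
        self.optimizer = paddle.optimizer.Adam(
            learning_rate=self.config.training.lr,  # illustrative config key
            parameters=self.model.parameters())

    def setup_dataloader(self):
        # create_dataloader is assumed to return (train_loader, valid_loader).
        self.train_loader, self.valid_loader = create_dataloader(
            self.args.data, self.config.data.valid_size,
            self.config.data.batch_size)

    def train_batch(self):
        batch = next(self.iterator)      # iterator assumed prepared by new_epoch()
        loss = self.model(batch).mean()  # placeholder loss computation
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()

    def valid(self):
        for batch in self.valid_loader:
            self.model(batch)            # compute/report validation metrics here
```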
Experiment initialization proceeds as follows. It creates the model, optimizer and data iterators, prepares the output directory, the logger and the visualizer, and saves the configuration. Apart from `setup_dataloader` and `setup_model`, which you implement yourself, the other methods all have standard implementations.

```python
def __init__(self, config, args):
    self.config = config
    self.args = args

def setup(self):
    paddle.set_device(self.args.device)
    if self.parallel:
        self.init_parallel()

    self.setup_output_dir()
    self.dump_config()
    self.setup_visualizer()
    self.setup_logger()
    self.setup_checkpointer()

    self.setup_dataloader()
    self.setup_model()

    self.iteration = 0
    self.epoch = 0
```

Setting up an experiment then takes only the following code:

```python
exp = Experiment(config, args)
exp.setup()
```

The whole training flow can be expressed as:

```python
def train(self):
    self.new_epoch()
    while self.iteration < self.config.training.max_iteration:
        self.iteration += 1
        self.train_batch()

        if self.iteration % self.config.training.valid_interval == 0:
            self.valid()

        if self.iteration % self.config.training.save_interval == 0:
            self.save()
```

To start the experiment, simply run:

```python
exp.run()
```
@@ -1,74 +0,0 @@
# Preparing Your Own Experiment

A typical deep learning experiment has several parts to handle:

1. preprocess the data as the model requires, and iterate the dataset in batches;
2. define the model, the optimizer and the other components;
3. write the training process (usually including forward/backward computation, parameter updates, logging, visualization, periodic evaluation, etc.);
4. configure and run the experiment.

## Data processing

For data processing, `parakeet.data` adopts the `Dataset -> DataLoader` pipeline commonly used in paddlepaddle. An overview of the data processing pipeline:

```text
Dataset --(transform)--> Dataset --+
                         sampler --+
                        batch_fn --+-> DataLoader
```

Here transform stands for per-example preprocessing. You can use TransformDataset from `parakeet.data` to build one Dataset from another.

Once you have the Dataset you want, supply a sampler and a batch function to build a DataLoader from it. The DataLoader's output can be used directly as the model's input.

See [data_cn](./data_cn.md) for detailed usage.

## Models

To strike a good balance between model reusability and functionality, we divide models into several kinds according to their characteristics.

Commonly used modules that can serve as parts of other, larger models are implemented as simply and generically as possible, because they will be reused. Modules with trainable parameters are generally implemented as subclasses of `paddle.nn.Layer`, but they do not target a specific task directly, so they do not carry the machinery for handling raw inputs and outputs. Modules without trainable parameters can be implemented directly as functions whose inputs and outputs are `paddle.Tensor`s or collections of them.

An out-of-the-box model targeting a specific task is generally implemented as a `paddle.nn.Layer` subclass and is the core computation unit of the task. For convenient handling of inputs and outputs, it may additionally be given the ability to handle raw inputs and outputs. For an NLP task, for example, although the network consumes text ids, text preprocessing and the text-to-id vocabulary should also be regarded as part of the model, so that it can handle raw input.

When a model is complex enough, splitting it into modules is the better choice. Even if the resulting submodules are not very generic and may serve only that one model, this is still recommended whenever it keeps the code clear and concise. (A small sketch of a reusable module follows.)

In Parakeet's directory layout, highly reusable modules are placed in [parakeet.modules](../parakeet/modules/), while task-specific models are placed in [parakeet.models](../parakeet/models).

When developing a new model, consider whether modules can be split out and how generic they are, and place them in the appropriate directory.
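As an illustration of the module/model distinction, a generic reusable module might look like this (a sketch, not an actual module from parakeet.modules):

```python
import paddle
from paddle import nn
from paddle.nn import functional as F


class PreNet(nn.Layer):
    """A generic, reusable module: two linear layers with ReLU and dropout.

    It has trainable parameters, so it subclasses paddle.nn.Layer, but it
    knows nothing about raw text or audio -- it only maps Tensors to Tensors.
    """

    def __init__(self, in_dim, hidden_dim, dropout=0.5):
        super().__init__()
        self.linear1 = nn.Linear(in_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        x = self.dropout(F.relu(self.linear2(x)))
        return x
```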
## Configuring experiments

We use yacs for configuration file parsing and argparse for command-line argument parsing. For the recommended configuration approach, see [Experiment Configuration](./config_cn.md).

## Training process

Training generally means running a loop body many times. A typical loop body includes:

1. iterating the dataset;
2. processing the batch data;
3. the network's forward/backward computation;
4. parameter updates;
5. evaluating the model on the validation set when certain conditions are met;
6. logging, visualization and, in some cases, saving necessary intermediate results;
7. saving the state of the model and optimizer.

Data processing covers the definitions of the dataset and the batch function; the model and optimizer cover the definitions of the forward/backward computation. With the model and data both ready, we organize them into the experiment code.

For assembling the training process, see [Experiment Workflow](./experiment_cn.md).

## Experiment template

Experiment code is generally organized as follows:

```text
├── README.md      (help for the experiment)
├── config.py      (default configuration)
├── preprocess.py  (data preprocessing script)
├── data.py        (Dataset, batch function, etc.)
├── synthesis.py   (code for generation)
├── train.py       (code for training)
└── utils.py       (other necessary helpers)
```

This repository contains several examples, which can be found in [Parakeet/examples](../examples). They are provided as samples users can run directly. Users are also welcome to add new models and experiments and contribute code to `Parakeet`.
@@ -1,63 +0,0 @@
=============
Installation
=============


Install PaddlePaddle
--------------------
Parakeet uses PaddlePaddle as its backend and therefore depends on it; note that Parakeet requires PaddlePaddle version 2.0 or above. You can install it via pip. To install the GPU build of PaddlePaddle, choose the wheel version matching the cuda and cudnn versions in your environment. For conda installation or building from source, see `PaddlePaddle quick install <https://www.paddlepaddle.org.cn/install/quick/>`_.

**GPU PaddlePaddle**

.. code-block:: bash

    python -m pip install paddlepaddle-gpu==2.0.0rc1.post101 -f https://paddlepaddle.org.cn/whl/stable.html
    python -m pip install paddlepaddle-gpu==2.0.0rc1.post100 -f https://paddlepaddle.org.cn/whl/stable.html


**CPU PaddlePaddle**

.. code-block:: bash

    python -m pip install paddlepaddle==2.0.0rc1 -i https://mirror.baidu.com/pypi/simple


Install libsndfile
-------------------

Parakeet's experiments frequently need audio processing and spectrogram processing, so we depend on librosa and soundfile. librosa and soundfile in turn depend on a C library, libsndfile, which is not a Python package. For Windows and mac users, libsndfile is installed along with soundfile when installing soundfile via pip. If you run into problems, see `SoundFile <https://pypi.org/project/SoundFile>`_.

Linux users need to install this package with the system package manager; commands for common distributions are listed below.


.. code-block::

    # ubuntu, debian
    sudo apt-get install libsndfile1

    # centos, fedora
    sudo yum install libsndfile

    # openSUSE
    sudo zypper in libsndfile
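After installing, a quick way to verify that the audio stack works — a minimal check, assuming only that soundfile and librosa installed successfully:

.. code-block:: python

    # Verify that the Python audio packages import and link against libsndfile.
    import soundfile
    import librosa

    print(soundfile.__libsndfile_version__)  # version of the underlying C library
    print(librosa.__version__)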
Install Parakeet
----------------

We provide two ways to use Parakeet.

#. If you want to run Parakeet's bundled experiment code or do further development, clone this project from github first, cd into the project directory, and do an editable install (the package is not copied to site-packages, and changes to the project take effect immediately without reinstalling). After that you are ready to use it.

   .. code-block:: bash

       # -e means an editable install
       pip install -e .


#. If you only need to run inference with the pretrained models we provide, you can instead install the wheel package from pypi directly.

   .. code-block:: bash

       pip install paddle-parakeet
@@ -1,11 +0,0 @@
# Parakeet Overview

<img src="../images/logo.png" alt="parakeet-logo" style="zoom: 33%;" />

Parakeet aims to provide the open-source community with a flexible, efficient and state-of-the-art speech synthesis toolbox. Parakeet is built on PaddlePaddle 2.0 and includes many influential TTS models from Baidu Research and other research institutions.

Parakeet offers users and developers:

1. reusable models and common modules;
2. complete experiments covering the whole pipeline from data processing and model training to inference;
3. high-quality, ready-to-use models.
@@ -0,0 +1,80 @@
# FastSpeech2 with AISHELL-3
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3).

## Introduction
AISHELL-3 is a large-scale, high-fidelity multi-speaker Mandarin speech corpus that can be used to train multi-speaker Text-to-Speech (TTS) systems.
We use AISHELL-3 to train a multi-speaker FastSpeech2 model here.

## Dataset

### Download and extract the dataset
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
```
Extract AISHELL-3.
```bash
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```

### Get the MFA results of AISHELL-3 and extract them
We use [MFA 2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download them from [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by following the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (which uses MFA 1.x for now) in our repo.

### Preprocess the dataset
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to preprocess the dataset.

```bash
./preprocess.sh
```

## Train the model
```bash
./run.sh
```
If you want to train FastSpeech2 on CPU, add the `--device=cpu` argument to `python3 train.py` in `run.sh`.

## Synthesize
We use [Parallel WaveGAN](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download the pretrained Parallel WaveGAN model (trained with baker) from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip parallel_wavegan_baker_ckpt_0.4.zip
```
`synthesize.sh` synthesizes waveforms from `metadata.jsonl`.
`synthesize_e2e.sh` synthesizes waveforms from a text list.

```bash
./synthesize.sh
```
or
```bash
./synthesize_e2e.sh
```

See the bash files for more details on the input parameters.

## Pretrained Model
A pretrained model with no silence at the edges of the audio clips can be downloaded here: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)

Then you can use the following script to synthesize `../sentences.txt` with the pretrained FastSpeech2 model.
```bash
python3 synthesize_e2e.py \
  --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
  --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
  --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
  --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
  --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
  --text=../sentences.txt \
  --output-dir=exp/default/test_e2e \
  --device="gpu" \
  --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
  --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```

## Future work
A multi-speaker vocoder is needed.
@@ -0,0 +1,59 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle

from parakeet.data.batch import batch_sequences


def collate_aishell3_examples(examples):
    # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
    energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
    durations = [
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    text_lengths = np.array([item["text_lengths"] for item in examples])
    speech_lengths = np.array([item["speech_lengths"] for item in examples])
    spk_id = np.array([item["spk_id"] for item in examples])

    text = batch_sequences(text)
    pitch = batch_sequences(pitch)
    speech = batch_sequences(speech)
    durations = batch_sequences(durations)
    energy = batch_sequences(energy)

    # convert each batch to paddle.Tensor
    text = paddle.to_tensor(text)
    pitch = paddle.to_tensor(pitch)
    speech = paddle.to_tensor(speech)
    durations = paddle.to_tensor(durations)
    energy = paddle.to_tensor(energy)
    text_lengths = paddle.to_tensor(text_lengths)
    speech_lengths = paddle.to_tensor(speech_lengths)
    spk_id = paddle.to_tensor(spk_id)

    batch = {
        "text": text,
        "text_lengths": text_lengths,
        "durations": durations,
        "speech": speech,
        "speech_lengths": speech_lengths,
        "pitch": pitch,
        "energy": energy,
        "spk_id": spk_id
    }
    return batch
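A sketch of wiring this collate function into a data loader (`train_dataset` is a placeholder for the preprocessed AISHELL-3 dataset, assumed to yield dicts with the fields listed above):

```python
from paddle.io import DataLoader, DistributedBatchSampler

# Batch sampler handling sharding/shuffling; batch size matches the config.
sampler = DistributedBatchSampler(
    train_dataset, batch_size=64, shuffle=True, drop_last=True)
train_dataloader = DataLoader(
    train_dataset,
    batch_sampler=sampler,
    collate_fn=collate_aishell3_examples,
    num_workers=4)

for batch in train_dataloader:
    print(batch["text"].shape)  # (batch_size, max_text_len)
    break
```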
@ -0,0 +1,106 @@
|
|||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
|
||||
fs: 24000 # sr
|
||||
n_fft: 2048 # FFT size.
|
||||
n_shift: 300 # Hop size.
|
||||
win_length: 1200 # Window length.
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
|
||||
# Only used for feats_type != raw
|
||||
|
||||
fmin: 80 # Minimum frequency of Mel basis.
|
||||
fmax: 7600 # Maximum frequency of Mel basis.
|
||||
n_mels: 80 # The number of mel basis.
|
||||
|
||||
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||
f0min: 80 # Maximum f0 for pitch extraction.
|
||||
f0max: 400 # Minimum f0 for pitch extraction.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 4
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
adim: 384 # attention dimension
|
||||
aheads: 2 # number of attention heads
|
||||
elayers: 4 # number of encoder layers
|
||||
eunits: 1536 # number of encoder ff units
|
||||
dlayers: 4 # number of decoder layers
|
||||
dunits: 1536 # number of decoder ff units
|
||||
positionwise_layer_type: conv1d # type of position-wise layer
|
||||
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||
    duration_predictor_kernel_size: 3   # filter size of duration predictor
    postnet_layers: 5                   # number of layers of postnet
    postnet_filts: 5                    # filter size of conv layers in postnet
    postnet_chans: 256                  # number of channels of conv layers in postnet
    use_masking: True                   # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True            # whether to use scaled positional encoding
    encoder_normalize_before: True      # whether to perform layer normalization before the encoder input
    decoder_normalize_before: True      # whether to perform layer normalization before the decoder input
    reduction_factor: 1                 # reduction factor
    init_type: xavier_uniform           # initialization type
    init_enc_alpha: 1.0                 # initial value of alpha of encoder scaled position encoding
    init_dec_alpha: 1.0                 # initial value of alpha of decoder scaled position encoding
    transformer_enc_dropout_rate: 0.2             # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2        # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2             # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2        # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5           # number of conv layers in pitch predictor
    pitch_predictor_chans: 256          # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5      # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5        # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1          # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0            # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true    # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2          # number of conv layers in energy predictor
    energy_predictor_chans: 256         # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3     # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5       # dropout rate in energy predictor
    energy_embed_kernel_size: 1         # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0           # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder
    spk_embed_dim: 256                  # speaker embedding dimension
    spk_embed_integration_type: concat  # speaker embedding integration type


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation


###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam           # optimizer type
    learning_rate: 0.001  # learning rate


###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5


###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
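For orientation, the `updater` and `optimizer` sections above are unpacked directly as keyword arguments by the training script further down in this diff, so their keys must match the Python signatures. A minimal sketch of that wiring (`model`, `train_dataloader`, and `output_dir` stand in for objects built in `train_sp`):

```python
config = get_cfg_default()                                  # loads conf/default.yaml
optimizer = build_optimizers(model, **config["optimizer"])  # optim, learning_rate
updater = FastSpeech2Updater(
    model=model,
    optimizer=optimizer,
    dataloader=train_dataloader,
    output_dir=output_dir,
    **config["updater"])                                    # use_masking
```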
@@ -0,0 +1,28 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

import yaml
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()

with open(config_path, 'rt') as f:
    _C = yaml.safe_load(f)
    _C = Configuration(_C)


def get_cfg_default():
    config = _C.clone()
    return config
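Since `get_cfg_default()` returns a `yacs` `CfgNode`, experiment-specific settings can be merged over the defaults, which is how `train.py` applies its `--config` flag. A minimal usage sketch (the override file path is hypothetical):

```python
from config import get_cfg_default

config = get_cfg_default()
# optionally layer an experiment config over the defaults
config.merge_from_file("conf/my_experiment.yaml")
print(config.batch_size)
print(config["optimizer"])
```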
@@ -0,0 +1,161 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

from paddle import distributed as dist
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class FastSpeech2Updater(StandardUpdater):
    def __init__(self,
                 model,
                 optimizer,
                 dataloader,
                 init_state=None,
                 use_masking=False,
                 use_weighted_masking=False,
                 output_dir=None):
        # pass the caller's init_state through (it was hard-coded to None)
        super().__init__(model, optimizer, dataloader, init_state=init_state)
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
            speech_lengths=batch["speech_lengths"],
            durations=batch["durations"],
            pitch=batch["pitch"],
            energy=batch["energy"],
            spk_id=batch["spk_id"])

        criterion = FastSpeech2Loss(
            use_masking=self.use_masking,
            use_weighted_masking=self.use_weighted_masking)

        l1_loss, duration_loss, pitch_loss, energy_loss = criterion(
            after_outs=after_outs,
            before_outs=before_outs,
            d_outs=d_outs,
            p_outs=p_outs,
            e_outs=e_outs,
            ys=ys,
            ds=batch["durations"],
            ps=batch["pitch"],
            es=batch["energy"],
            ilens=batch["text_lengths"],
            olens=olens)

        loss = l1_loss + duration_loss + pitch_loss + energy_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/l1_loss", float(l1_loss))
        report("train/duration_loss", float(duration_loss))
        report("train/pitch_loss", float(pitch_loss))
        report("train/energy_loss", float(energy_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["pitch_loss"] = float(pitch_loss)
        losses_dict["energy_loss"] = float(energy_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())


class FastSpeech2Evaluator(StandardEvaluator):
    def __init__(self,
                 model,
                 dataloader,
                 use_masking=False,
                 use_weighted_masking=False,
                 output_dir=None):
        super().__init__(model, dataloader)
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}

        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
            speech_lengths=batch["speech_lengths"],
            durations=batch["durations"],
            pitch=batch["pitch"],
            energy=batch["energy"],
            spk_id=batch["spk_id"])

        criterion = FastSpeech2Loss(
            use_masking=self.use_masking,
            use_weighted_masking=self.use_weighted_masking)
        l1_loss, duration_loss, pitch_loss, energy_loss = criterion(
            after_outs=after_outs,
            before_outs=before_outs,
            d_outs=d_outs,
            p_outs=p_outs,
            e_outs=e_outs,
            ys=ys,
            ds=batch["durations"],
            ps=batch["pitch"],
            es=batch["energy"],
            ilens=batch["text_lengths"],
            olens=olens)
        loss = l1_loss + duration_loss + pitch_loss + energy_loss

        report("eval/loss", float(loss))
        report("eval/l1_loss", float(l1_loss))
        report("eval/duration_loss", float(duration_loss))
        report("eval/pitch_loss", float(pitch_loss))
        report("eval/energy_loss", float(energy_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["pitch_loss"] = float(pitch_loss)
        losses_dict["energy_loss"] = float(energy_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
@@ -0,0 +1,78 @@
#!/bin/bash

stage=0
stop_stage=100
fs=24000
n_shift=300

export MAIN_ROOT=`realpath ${PWD}/../../../`

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output=durations.txt \
        --sample-rate=${fs} \
        --n-shift=${n_shift}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${MAIN_ROOT}/utils/fastspeech2_preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config-path=conf/default.yaml \
        --num-cpu=8 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get the features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="pitch"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="energy"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phones to ids; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
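The `stage`/`stop_stage` pair gates the four blocks above, so any single step can be re-run in isolation by narrowing the window at the top of the script. For example, to redo only normalization (stage 3) after regenerating the statistics:

```bash
stage=3
stop_stage=3
```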
@@ -0,0 +1,160 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config, pwg_config):
    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct the dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
        test_metadata = list(reader)
    test_dataset = DataTable(
        data=test_metadata, fields=["utt_id", "text", "spk_id"])

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
    num_speakers = len(spk_id)
    print("num_speakers:", num_speakers)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
        num_speakers=num_speakers,
        **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for datum in test_dataset:
        utt_id = datum["utt_id"]
        text = paddle.to_tensor(datum["text"])
        spk_id = paddle.to_tensor(datum["spk_id"])

        with paddle.no_grad():
            wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
            samplerate=fastspeech2_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="parallel wavegan config file.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict",
        type=str,
        default="speaker_id_map.txt",
        help="speaker id map file.")
    parser.add_argument("--test-metadata", type=str, help="test metadata.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    print(pwg_config)

    evaluate(args, fastspeech2_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
#!/bin/bash

python3 synthesize.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
    --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
    --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
@@ -0,0 +1,174 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import Frontend


def evaluate(args, fastspeech2_config, pwg_config):
    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct the sentence list for synthesis
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
    num_speakers = len(spk_id)
    print("num_speakers:", num_speakers)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
        num_speakers=num_speakers,
        **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    frontend = Frontend(args.phones_dict)
    print("frontend done!")

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # only test speaker number 0
    spk_id = 0
    for utt_id, sentence in sentences:
        input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
        phone_ids = input_ids["phone_ids"]
        flags = 0
        for part_phone_ids in phone_ids:
            with paddle.no_grad():
                mel = fastspeech2_inference(
                    part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                temp_wav = pwg_inference(mel)
            # concatenate the waveforms of the sentence chunks
            if flags == 0:
                wav = temp_wav
                flags = 1
            else:
                wav = paddle.concat([wav, temp_wav])
        sf.write(
            str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")),
            wav.numpy(),
            samplerate=fastspeech2_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="parallel wavegan config file.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict",
        type=str,
        default="speaker_id_map.txt",
        help="speaker id map file.")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    print(pwg_config)

    evaluate(args, fastspeech2_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
#!/bin/bash

python3 synthesize_e2e.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
    --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
    --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
    --text=../sentences.txt \
    --output-dir=exp/default/test_e2e \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
@@ -0,0 +1,232 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging
from pathlib import Path

import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from visualdl import LogWriter
import yaml

from batch_fn import collate_aishell3_examples
from config import get_cfg_default
from fastspeech2_updater import FastSpeech2Evaluator
from fastspeech2_updater import FastSpeech2Updater

optim_classes = dict(
    adadelta=paddle.optimizer.Adadelta,
    adagrad=paddle.optimizer.Adagrad,
    adam=paddle.optimizer.Adam,
    adamax=paddle.optimizer.Adamax,
    adamw=paddle.optimizer.AdamW,
    lamb=paddle.optimizer.Lamb,
    momentum=paddle.optimizer.Momentum,
    rmsprop=paddle.optimizer.RMSProp,
    sgd=paddle.optimizer.SGD)


def build_optimizers(model: nn.Layer, optim='adadelta',
                     learning_rate=0.01) -> paddle.optimizer:
    optim_class = optim_classes.get(optim)
    if optim_class is None:
        raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
    optimizer = optim_class(
        parameters=model.parameters(), learning_rate=learning_rate)
    return optimizer


def train_sp(args, config):
    # decide device type and whether to run in parallel,
    # and set up the running environment accordingly
    # (world_size is queried up front so it is defined on CPU as well)
    world_size = paddle.distributed.get_world_size()
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed; it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}")

    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct datasets for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=[
            "text", "text_lengths", "speech", "speech_lengths", "durations",
            "pitch", "energy", "spk_id"
        ],
        converters={"speech": np.load,
                    "pitch": np.load,
                    "energy": np.load})
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)

    dev_dataset = DataTable(
        data=dev_metadata,
        fields=[
            "text", "text_lengths", "speech", "speech_lengths", "durations",
            "pitch", "energy", "spk_id"
        ],
        converters={"speech": np.load,
                    "pitch": np.load,
                    "energy": np.load})

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_aishell3_examples,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_aishell3_examples,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
    num_speakers = len(spk_id)
    print("num_speakers:", num_speakers)

    odim = config.n_mels
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
        num_speakers=num_speakers,
        **config["model"])
    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    updater = FastSpeech2Updater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir,
        **config["updater"])

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(
        model, dev_dataloader, output_dir=output_dir, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        writer = LogWriter(str(output_dir))
        trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    # print(trainer.extensions)
    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Train a FastSpeech2 model with the AISHELL-3 Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict",
        type=str,
        default="speaker_id_map.txt",
        help="speaker id map file.")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
        raise RuntimeError("Multiprocess training on CPU is not supported.")
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees world size: {dist.get_world_size()}, from pid: {os.getpid()}")

    # dispatch
    if args.nprocs > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,63 @@
# FastSpeech2 with the Baker dataset
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).

## Dataset

### Download and Extract the dataset
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).

### Get the MFA result of CSMSC and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for FastSpeech2.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model following the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.

### Preprocess the dataset
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of BZNSYP is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.

```bash
./preprocess.sh
```

## Train the model
```bash
./run.sh
```
If you want to train FastSpeech2 on CPU, add `--device=cpu` to the `python3 train.py` invocation in `run.sh`.

## Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download the pretrained parallel wavegan model from [parallel_wavegan_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/parallel_wavegan_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip parallel_wavegan_baker_ckpt_0.4.zip
```
`synthesize.sh` synthesizes waveforms from `metadata.jsonl`.
`synthesize_e2e.sh` synthesizes waveforms from a text list.

```bash
./synthesize.sh
```
or
```bash
./synthesize_e2e.sh
```

See the bash files for more details on the input parameters.

## Pretrained Model
A pretrained model, trained on audio with no silence at the edges, can be downloaded here: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip).

Then you can use the following script to synthesize `../sentences.txt` with the pretrained FastSpeech2 model.
```bash
python3 synthesize_e2e.py \
    --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
    --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
    --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
    --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
    --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
    --text=../sentences.txt \
    --output-dir=exp/default/test_e2e \
    --device="gpu" \
    --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
```
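For reference, `synthesize_e2e.py` reads the `--text` file line by line and splits each line on whitespace into an utterance id and a sentence, so every line must be a `utt_id sentence` pair with no spaces inside the sentence. A hypothetical two-line `sentences.txt`:

```text
001 欢迎使用语音合成系统
002 今天天气很好
```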
@@ -0,0 +1,56 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle

from parakeet.data.batch import batch_sequences


def collate_baker_examples(examples):
    # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"]
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
    energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
    durations = [
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    text_lengths = np.array([item["text_lengths"] for item in examples])
    speech_lengths = np.array([item["speech_lengths"] for item in examples])

    text = batch_sequences(text)
    pitch = batch_sequences(pitch)
    speech = batch_sequences(speech)
    durations = batch_sequences(durations)
    energy = batch_sequences(energy)

    # convert each batch to paddle.Tensor
    text = paddle.to_tensor(text)
    pitch = paddle.to_tensor(pitch)
    speech = paddle.to_tensor(speech)
    durations = paddle.to_tensor(durations)
    energy = paddle.to_tensor(energy)
    text_lengths = paddle.to_tensor(text_lengths)
    speech_lengths = paddle.to_tensor(speech_lengths)

    batch = {
        "text": text,
        "text_lengths": text_lengths,
        "durations": durations,
        "speech": speech,
        "speech_lengths": speech_lengths,
        "pitch": pitch,
        "energy": energy
    }
    return batch
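A quick self-contained check of the collate function. The field shapes here are synthetic and chosen only for illustration; real examples come from the normalized `metadata.jsonl` via `DataTable`:

```python
import numpy as np
from batch_fn import collate_baker_examples

# two fake utterances, each with 3 phones and 50 mel frames (illustrative shapes)
examples = [{
    "text": np.array([1, 2, 3], dtype=np.int64),
    "text_lengths": 3,
    "speech": np.zeros((50, 80), dtype=np.float32),
    "speech_lengths": 50,
    "durations": np.array([10, 20, 20], dtype=np.int64),
    "pitch": np.zeros((3, 1), dtype=np.float32),
    "energy": np.zeros((3, 1), dtype=np.float32),
} for _ in range(2)]

batch = collate_baker_examples(examples)
print(batch["speech"].shape)  # [2, 50, 80] after padding and stacking
```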
@@ -0,0 +1,104 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000        # sample rate
n_fft: 2048      # FFT size
n_shift: 300     # Hop size
win_length: 1200 # Window length.
                 # If set to null, it will be the same as n_fft.
window: "hann"   # Window function

# Only used for feats_type != raw

fmin: 80         # Minimum frequency of Mel basis
fmax: 7600       # Maximum frequency of Mel basis
n_mels: 80       # The number of mel basis

# Only used for models that use pitch features (e.g. FastSpeech2)
f0min: 80        # Minimum f0 for pitch extraction
f0max: 400       # Maximum f0 for pitch extraction


###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 4


###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    adim: 384         # attention dimension
    aheads: 2         # number of attention heads
    elayers: 4        # number of encoder layers
    eunits: 1536      # number of encoder ff units
    dlayers: 4        # number of decoder layers
    dunits: 1536      # number of decoder ff units
    positionwise_layer_type: conv1d   # type of position-wise layer
    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
    duration_predictor_layers: 2      # number of layers of duration predictor
    duration_predictor_chans: 256     # number of channels of duration predictor
    duration_predictor_kernel_size: 3 # filter size of duration predictor
    postnet_layers: 5                 # number of layers of postnet
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    use_masking: True                 # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True          # whether to use scaled positional encoding
    encoder_normalize_before: True    # whether to perform layer normalization before the encoder input
    decoder_normalize_before: True    # whether to perform layer normalization before the decoder input
    reduction_factor: 1               # reduction factor
    init_type: xavier_uniform         # initialization type
    init_enc_alpha: 1.0               # initial value of alpha of encoder scaled position encoding
    init_dec_alpha: 1.0               # initial value of alpha of decoder scaled position encoding
    transformer_enc_dropout_rate: 0.2             # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2        # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2             # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2        # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5         # number of conv layers in pitch predictor
    pitch_predictor_chans: 256        # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5      # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1        # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0          # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true    # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2        # number of conv layers in energy predictor
    energy_predictor_chans: 256       # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3   # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5     # dropout rate in energy predictor
    energy_embed_kernel_size: 1       # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0         # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false  # whether to stop the gradient from energy predictor to encoder


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation


###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam           # optimizer type
    learning_rate: 0.001  # learning rate


###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1000
num_snapshots: 5


###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
@@ -0,0 +1,28 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

import yaml
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()

with open(config_path, 'rt') as f:
    _C = yaml.safe_load(f)
    _C = Configuration(_C)


def get_cfg_default():
    config = _C.clone()
    return config
@@ -0,0 +1,158 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

from paddle import distributed as dist
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class FastSpeech2Updater(StandardUpdater):
    def __init__(self,
                 model,
                 optimizer,
                 dataloader,
                 init_state=None,
                 use_masking=False,
                 use_weighted_masking=False,
                 output_dir=None):
        # pass the caller's init_state through (it was hard-coded to None)
        super().__init__(model, optimizer, dataloader, init_state=init_state)
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
            speech_lengths=batch["speech_lengths"],
            durations=batch["durations"],
            pitch=batch["pitch"],
            energy=batch["energy"])

        criterion = FastSpeech2Loss(
            use_masking=self.use_masking,
            use_weighted_masking=self.use_weighted_masking)

        l1_loss, duration_loss, pitch_loss, energy_loss = criterion(
            after_outs=after_outs,
            before_outs=before_outs,
            d_outs=d_outs,
            p_outs=p_outs,
            e_outs=e_outs,
            ys=ys,
            ds=batch["durations"],
            ps=batch["pitch"],
            es=batch["energy"],
            ilens=batch["text_lengths"],
            olens=olens)

        loss = l1_loss + duration_loss + pitch_loss + energy_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/l1_loss", float(l1_loss))
        report("train/duration_loss", float(duration_loss))
        report("train/pitch_loss", float(pitch_loss))
        report("train/energy_loss", float(energy_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["pitch_loss"] = float(pitch_loss)
        losses_dict["energy_loss"] = float(energy_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())


class FastSpeech2Evaluator(StandardEvaluator):
    def __init__(self,
                 model,
                 dataloader,
                 use_masking=False,
                 use_weighted_masking=False,
                 output_dir=None):
        super().__init__(model, dataloader)
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}

        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
            speech_lengths=batch["speech_lengths"],
            durations=batch["durations"],
            pitch=batch["pitch"],
            energy=batch["energy"])

        criterion = FastSpeech2Loss(
            use_masking=self.use_masking,
            use_weighted_masking=self.use_weighted_masking)
        l1_loss, duration_loss, pitch_loss, energy_loss = criterion(
            after_outs=after_outs,
            before_outs=before_outs,
            d_outs=d_outs,
            p_outs=p_outs,
            e_outs=e_outs,
            ys=ys,
            ds=batch["durations"],
            ps=batch["pitch"],
            es=batch["energy"],
            ilens=batch["text_lengths"],
            olens=olens)
        loss = l1_loss + duration_loss + pitch_loss + energy_loss

        report("eval/loss", float(loss))
        report("eval/l1_loss", float(l1_loss))
        report("eval/duration_loss", float(duration_loss))
        report("eval/pitch_loss", float(pitch_loss))
        report("eval/energy_loss", float(energy_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["pitch_loss"] = float(pitch_loss)
        losses_dict["energy_loss"] = float(energy_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
@@ -0,0 +1,121 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Dict
from typing import List

import numpy as np
import paddle

from parakeet.frontend.cn_frontend import Frontend as cnFrontend


class Frontend():
    def __init__(self, phone_vocab_path=None, tone_vocab_path=None):
        self.frontend = cnFrontend()
        self.vocab_phones = {}
        self.vocab_tones = {}
        if phone_vocab_path:
            with open(phone_vocab_path, 'rt') as f:
                phn_id = [line.strip().split() for line in f.readlines()]
            for phn, id in phn_id:
                self.vocab_phones[phn] = int(id)
        if tone_vocab_path:
            with open(tone_vocab_path, 'rt') as f:
                tone_id = [line.strip().split() for line in f.readlines()]
            for tone, id in tone_id:
                self.vocab_tones[tone] = int(id)

    def _p2id(self, phonemes: List[str]) -> np.ndarray:
        # replace unknown phones with "sp"
        phonemes = [
            phn if phn in self.vocab_phones else "sp" for phn in phonemes
        ]
        phone_ids = [self.vocab_phones[item] for item in phonemes]
        return np.array(phone_ids, np.int64)

    def _t2id(self, tones: List[str]) -> np.ndarray:
        # replace unknown tones with "0"
        tones = [tone if tone in self.vocab_tones else "0" for tone in tones]
        tone_ids = [self.vocab_tones[item] for item in tones]
        return np.array(tone_ids, np.int64)

    def _get_phone_tone(self, phonemes: List[str],
                        get_tone_ids: bool=False) -> List[List[str]]:
        phones = []
        tones = []
        if get_tone_ids and self.vocab_tones:
            for full_phone in phonemes:
                # split the tone from the finals
                match = re.match(r'^(\w+)([012345])$', full_phone)
                if match:
                    phone = match.group(1)
                    tone = match.group(2)
                    # if a merged erhua is not in the vocab, e.g. the input is
                    # ['iaor3'] and 'iaor' is not in self.vocab_phones, split
                    # 'iaor' into ['iao', 'er'] and extend the tones from
                    # ['3'] to ['3', '2'], '2' being the tone of 'er2'
                    if (len(phone) >= 2 and phone != "er" and
                            phone[-1] == 'r' and
                            phone not in self.vocab_phones and
                            phone[:-1] in self.vocab_phones):
                        phones.append(phone[:-1])
                        phones.append("er")
                        tones.append(tone)
                        tones.append("2")
                    else:
                        phones.append(phone)
                        tones.append(tone)
                else:
                    phones.append(full_phone)
                    tones.append('0')
        else:
            for phone in phonemes:
                # if a merged erhua is not in the vocab, e.g. the input is
                # ['iaor3'] and 'iaor3' is not in self.vocab_phones, change
                # ['iaor3'] to ['iao3', 'er2']
                if (len(phone) >= 3 and phone[:-1] != "er" and
                        phone[-2] == 'r' and
                        phone not in self.vocab_phones and
                        (phone[:-2] + phone[-1]) in self.vocab_phones):
                    phones.append(phone[:-2] + phone[-1])
                    phones.append("er2")
                else:
                    phones.append(phone)
        return phones, tones

    def get_input_ids(
            self,
            sentence: str,
            merge_sentences: bool=True,
            get_tone_ids: bool=False) -> Dict[str, List[paddle.Tensor]]:
        phonemes = self.frontend.get_phonemes(
            sentence, merge_sentences=merge_sentences)
        result = {}
        phones = []
        tones = []
        temp_phone_ids = []
        temp_tone_ids = []
        for part_phonemes in phonemes:
            phones, tones = self._get_phone_tone(
                part_phonemes, get_tone_ids=get_tone_ids)
            if tones:
                tone_ids = self._t2id(tones)
                tone_ids = paddle.to_tensor(tone_ids)
                temp_tone_ids.append(tone_ids)
            if phones:
                phone_ids = self._p2id(phones)
                phone_ids = paddle.to_tensor(phone_ids)
                temp_phone_ids.append(phone_ids)
        if temp_tone_ids:
            result["tone_ids"] = temp_tone_ids
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
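A minimal sketch of how `synthesize_e2e.py` drives this class; the vocab path is the preprocessing output and the sentence is illustrative:

```python
frontend = Frontend(phone_vocab_path="dump/phone_id_map.txt")
input_ids = frontend.get_input_ids("今天天气很好", merge_sentences=True)
for phone_ids in input_ids["phone_ids"]:  # one tensor per sentence chunk
    print(phone_ids.numpy())
```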
@ -0,0 +1,78 @@
|
|||
#!/bin/bash
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
fs=24000
|
||||
n_shift=300
|
||||
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./baker_alignment_tone \
|
||||
--output=durations.txt \
|
||||
--sample-rate=${fs} \
|
||||
--n-shift=${n_shift}
|
||||
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${MAIN_ROOT}/utils/fastspeech2_preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config-path=conf/default.yaml \
        --num-cpu=8 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="pitch"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="energy"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${MAIN_ROOT}/utils/fastspeech2_normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi

@@ -0,0 +1,9 @@
#!/bin/bash

python3 train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=1 \
    --phones-dict=dump/phone_id_map.txt

@@ -0,0 +1,144 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config, pwg_config):
    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
        test_metadata = list(reader)
    test_dataset = DataTable(data=test_metadata, fields=["utt_id", "text"])

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for datum in test_dataset:
        utt_id = datum["utt_id"]
        text = paddle.to_tensor(datum["text"])

        with paddle.no_grad():
            wav = pwg_inference(fastspeech2_inference(text))
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
            samplerate=fastspeech2_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="parallel wavegan config file.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument("--test-metadata", type=str, help="test metadata.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    print(pwg_config)

    evaluate(args, fastspeech2_config, pwg_config)


if __name__ == "__main__":
    main()

@@ -0,0 +1,14 @@
#!/bin/bash

python3 synthesize.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
    --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
    --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt

@@ -0,0 +1,159 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import Frontend


def evaluate(args, fastspeech2_config, pwg_config):
    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    frontend = Frontend(args.phones_dict)
    print("frontend done!")

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
        phone_ids = input_ids["phone_ids"]
        flags = 0
        for part_phone_ids in phone_ids:
            with paddle.no_grad():
                mel = fastspeech2_inference(part_phone_ids)
                temp_wav = pwg_inference(mel)
                if flags == 0:
                    wav = temp_wav
                    flags = 1
                else:
                    wav = paddle.concat([wav, temp_wav])
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
            samplerate=fastspeech2_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="parallel wavegan config file.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    print(pwg_config)

    evaluate(args, fastspeech2_config, pwg_config)


if __name__ == "__main__":
    main()

@@ -0,0 +1,14 @@
#!/bin/bash

python3 synthesize_e2e.py \
    --fastspeech2-config=conf/default.yaml \
    --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
    --fastspeech2-stat=dump/train/speech_stats.npy \
    --pwg-config=parallel_wavegan_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-params=parallel_wavegan_baker_ckpt_0.4/pwg_generator.pdparams \
    --pwg-stat=parallel_wavegan_baker_ckpt_0.4/pwg_stats.npy \
    --text=../sentences.txt \
    --output-dir=exp/default/test_e2e \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt

@@ -0,0 +1,218 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging
from pathlib import Path

import jsonlines
import numpy as np
import paddle
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader, DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from visualdl import LogWriter
import yaml

from batch_fn import collate_baker_examples
from config import get_cfg_default
from fastspeech2_updater import FastSpeech2Evaluator
from fastspeech2_updater import FastSpeech2Updater

optim_classes = dict(
    adadelta=paddle.optimizer.Adadelta,
    adagrad=paddle.optimizer.Adagrad,
    adam=paddle.optimizer.Adam,
    adamax=paddle.optimizer.Adamax,
    adamw=paddle.optimizer.AdamW,
    lamb=paddle.optimizer.Lamb,
    momentum=paddle.optimizer.Momentum,
    rmsprop=paddle.optimizer.RMSProp,
    sgd=paddle.optimizer.SGD, )


def build_optimizers(model: nn.Layer, optim='adadelta',
                     learning_rate=0.01) -> paddle.optimizer:
    optim_class = optim_classes.get(optim)
    if optim_class is None:
        raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
    else:
        optim = optim_class(
            parameters=model.parameters(), learning_rate=learning_rate)

    optimizers = optim
    return optimizers
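The dictionary dispatch above keeps the optimizer a plain string in the config (`optimizer: {optim: ..., learning_rate: ...}`). A hypothetical usage sketch:

```python
import paddle

model = paddle.nn.Linear(8, 8)  # any nn.Layer works here
optimizer = build_optimizers(model, optim="adam", learning_rate=0.001)
print(type(optimizer).__name__)  # Adam
```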


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=[
            "text", "text_lengths", "speech", "speech_lengths", "durations",
            "pitch", "energy"
        ],
        converters={"speech": np.load,
                    "pitch": np.load,
                    "energy": np.load}, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=[
            "text", "text_lengths", "speech", "speech_lengths", "durations",
            "pitch", "energy"
        ],
        converters={"speech": np.load,
                    "pitch": np.load,
                    "energy": np.load}, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)
    print("dataloaders done!")

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = FastSpeech2(idim=vocab_size, odim=odim, **config["model"])
    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    updater = FastSpeech2Updater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir,
        **config["updater"])

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = FastSpeech2Evaluator(
        model, dev_dataloader, output_dir=output_dir, **config["updater"])

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        writer = LogWriter(str(output_dir))
        trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    # print(trainer.extensions)
    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a FastSpeech2 "
                                     "model with the Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
        raise RuntimeError("Multiprocess training on CPU is not supported.")
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.nprocs > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()

@@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃，迫在眉睫。
002 对于所有想要离开那片废土，去寻找更美好生活的人来说。
003 克哈，是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害，我所做的，比他们自己的领导委员会都多。
005 无论他们如何诽谤我，我将继续为所有泰伦人的最大利益，而努力奋斗。
006 身为你们的元首，我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长，用行动回击那些只会说风凉话，不愿意和我们相向而行的害群之马。
008 帝国武装力量，无数的优秀儿女，正时刻守卫着我们的家园大门，但是他们孤木难支。
009 凡是今天应征入伍者，所获的所有刑罚罪责，减半。
010 激进分子和异见者希望你们一听见枪声，就背弃多年的和平与繁荣。
011 他们没有勇气和能力，带领人类穿越一个充满危险的星系。
012 法治是我们的命脉，然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光，绝不会向任何外星势力低头。
014 我已经驯服了异虫，荡平了星灵。如今它们的创造者，想要夺走我们拥有的一切。
015 永远记住，谁才是最能保护你们的人。
016 不要听信别人的谗言，我不是什么克隆人。

@@ -30,9 +30,7 @@ except ModuleNotFoundError:
 INT16_MAX = (2**15) - 1
 
 
-def normalize_volume(wav,
-                     target_dBFS,
-                     increase_only=False,
+def normalize_volume(wav, target_dBFS, increase_only=False,
                      decrease_only=False):
     # this function implements Loudness normalization, instead of peak
     # normalization, See https://en.wikipedia.org/wiki/Audio_normalization
@@ -44,8 +42,9 @@ def normalize_volume(wav,
     if increase_only and decrease_only:
         raise ValueError("Both increase only and decrease only are set")
     dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
-    if ((dBFS_change < 0 and increase_only) or
-            (dBFS_change > 0 and decrease_only)):
+    if dBFS_change < 0 and increase_only:
+        return wav
+    if dBFS_change > 0 and decrease_only:
         return wav
     gain = 10**(dBFS_change / 20)
     return wav * gain
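The arithmetic here is standard dBFS bookkeeping: mean power maps to decibels via 10·log10, and a decibel change maps back to an amplitude factor via 10^(dB/20). A small self-contained check (not project code):

```python
import numpy as np

# A constant-amplitude signal at 0.1 has mean power 0.01, i.e. -20 dBFS.
wav = np.full(1000, 0.1)
target_dBFS = -30
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))  # -10 dB
gain = 10**(dBFS_change / 20)
new_dBFS = 10 * np.log10(np.mean((wav * gain)**2))
print(round(float(new_dBFS), 6))  # -30.0
```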
@@ -59,9 +58,14 @@ def trim_long_silences(wav,
"""
|
||||
Ensures that segments without voice in the waveform remain no longer than a
|
||||
threshold determined by the VAD parameters in params.py.
|
||||
|
||||
:param wav: the raw waveform as a numpy array of floats
|
||||
:return: the same waveform with silences trimmed away (length <= original wav length)
|
||||
Parameters
|
||||
----------
|
||||
wav : np.array
|
||||
the raw waveform as a numpy array of floats
|
||||
Returns
|
||||
----------
|
||||
np.array
|
||||
the same waveform with silences trimmed away (length <= original wav length)
|
||||
"""
|
||||
# Compute the voice detection window size
|
||||
samples_per_window = (vad_window_length * sampling_rate) // 1000
|
||||
|
@@ -117,20 +121,25 @@ def compute_partial_slices(n_samples: int,
 
     The returned ranges may be indexing further than the length of the waveform. It is
     recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+    Parameters
+    ----------
+    n_samples : int
+        the number of samples in the waveform.
+    partial_utterance_n_frames : int
+        the number of mel spectrogram frames in each partial utterance.
 
-    :param n_samples: the number of samples in the waveform
-    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
-    utterance
-    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
-    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
-    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
-    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
-    utterance, this parameter is ignored so that the function always returns at least 1 slice.
-    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
-    utterances are entirely disjoint.
-    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
-    respectively the waveform and the mel spectrogram with these slices to obtain the partial
-    utterances.
+    min_pad_coverage : int
+        when reaching the last partial utterance, it may or may not have enough frames.
+        If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+        it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+        utterance, this parameter is ignored so that the function always returns at least 1 slice.
+    overlap : float
+        by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
+    Returns
+    ----------
+    the waveform slices and mel spectrogram slices as lists of array slices.
+    Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
     """
     assert 0 <= overlap < 1
     assert 0 < min_pad_coverage <= 1
@@ -138,8 +147,8 @@ def compute_partial_slices(n_samples: int,
     # librosa's function to compute num_frames from num_samples
     n_frames = int(np.ceil((n_samples + 1) / hop_length))
     # frame shift between adjacent partials
-    frame_step = max(
-        1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
+    frame_step = max(1,
+                     int(np.round(partial_utterance_n_frames * (1 - overlap))))
 
     # Compute the slices
     wav_slices, mel_slices = [], []

@@ -57,7 +57,7 @@ def _process_speaker(speaker_dir: Path,
         try:
             with sources_fpath.open("rt") as sources_file:
                 existing_names = {line.split(",")[0] for line in sources_file}
-        except:
+        except Exception as e:
             existing_names = {}
     else:
         existing_names = {}
@@ -114,9 +114,7 @@ def process_librispeech(processor,
                              output_dir, "*.flac", skip_existing)
 
 
-def process_voxceleb1(processor,
-                      datasets_root,
-                      output_dir,
+def process_voxceleb1(processor, datasets_root, output_dir,
                       skip_existing=False):
     dataset_name = "VoxCeleb1"
     dataset_root = datasets_root / dataset_name
@@ -126,10 +124,7 @@ def process_voxceleb1(processor,
     metadata = [line.strip().split("\t") for line in metafile][1:]
 
     # speaker id -> nationality
-    nationalities = {
-        line[0]: line[3]
-        for line in metadata if line[-1] == "dev"
-    }
+    nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
     keep_speaker_ids = [
         speaker_id for speaker_id, nationality in nationalities.items()
         if nationality.lower() in anglophone_nationalites
@@ -147,9 +142,7 @@ def process_voxceleb1(processor,
                              output_dir, "*.wav", skip_existing)
 
 
-def process_voxceleb2(processor,
-                      datasets_root,
-                      output_dir,
+def process_voxceleb2(processor, datasets_root, output_dir,
                       skip_existing=False):
     dataset_name = "VoxCeleb2"
     dataset_root = datasets_root / dataset_name
@@ -171,9 +164,7 @@ def process_aidatatang_200zh(processor,
                              output_dir, "*.wav", skip_existing)
 
 
-def process_magicdata(processor,
-                      datasets_root,
-                      output_dir,
+def process_magicdata(processor, datasets_root, output_dir,
                       skip_existing=False):
     dataset_name = "magicdata/train"
     dataset_root = datasets_root / dataset_name

@@ -52,7 +52,8 @@ if __name__ == "__main__":
     if not args.no_trim:
         try:
             import webrtcvad
-        except:
+            print(webrtcvad.__version__)
+        except Exception as e:
             raise ModuleNotFoundError(
                 "Package 'webrtcvad' not found. This package enables "
                 "noise removal and is recommended. Please install and "
@@ -96,5 +97,5 @@ if __name__ == "__main__":
 
     for dataset in args.datasets:
         print("Preprocessing %s" % dataset)
-        preprocess_func[dataset](processor, args.datasets_root,
-                                 args.output_dir, args.skip_existing)
+        preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
+                                 args.skip_existing)

@@ -83,12 +83,11 @@ class Ge2eExperiment(ExperimentBase):
         self.logger.info(msg)
 
         if dist.get_rank() == 0:
-            self.visualizer.add_scalar("train/loss", loss_value,
-                                       self.iteration)
+            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
             self.visualizer.add_scalar("train/eer", eer, self.iteration)
-            self.visualizer.add_scalar(
-                "param/w",
-                float(self.model_core.similarity_weight), self.iteration)
+            self.visualizer.add_scalar("param/w",
+                                       float(self.model_core.similarity_weight),
+                                       self.iteration)
             self.visualizer.add_scalar("param/b",
                                        float(self.model_core.similarity_bias),
                                        self.iteration)

@@ -0,0 +1,118 @@
# Parallel WaveGAN with the Baker dataset

This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).

## Preprocess the dataset

Download the dataset from the [official website of data-baker](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. The dataset is then in the directory `~/datasets/BZNSYP`.

Run the script for preprocessing.

```bash
bash preprocess.sh
```

When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

```text
dump
├── dev
│   ├── norm
│   └── raw
├── test
│   ├── norm
│   └── raw
└── train
    ├── norm
    ├── raw
    └── stats.npy
```

The dataset is split into 3 parts, namely train, dev and test, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and are located in `dump/train/stats.npy`.

There is also a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the paths to the spectrogram of each utterance.

## Train the model

To train the model, use `run.sh`. It is an example script to run `train.py`.

```bash
bash run.sh
```

Or you can use `train.py` directly. Here's the complete help message to run it.

```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]

Train a Parallel WaveGAN model with the Baker Mandarin TTS dataset.

optional arguments:
  -h, --help            show this help message and exit
  --config CONFIG       config file to overwrite default config
  --train-metadata TRAIN_METADATA
                        training data
  --dev-metadata DEV_METADATA
                        dev data
  --output-dir OUTPUT_DIR
                        output dir
  --device DEVICE       device type to use
  --nprocs NPROCS       number of processes
  --verbose VERBOSE     verbose
```

1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata files in the `norm` subfolders of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--device` is the type of device to run the experiment on; 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel; note that nprocs > 1 is only supported when `--device` is 'gpu'.

## Pretrained Models

Pretrained models can be downloaded here:
1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.

The Parallel WaveGAN checkpoint contains the files listed below.

```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml              # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
└── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
```

## Synthesize

When training is done or a pretrained model has been downloaded, you can run `synthesize.py` to synthesize.

```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
                     [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
                     [--device DEVICE] [--verbose VERBOSE]

synthesize with parallel wavegan.

optional arguments:
  -h, --help            show this help message and exit
  --config CONFIG       config file to overwrite default config
  --checkpoint CHECKPOINT
                        snapshot to load
  --test-metadata TEST_METADATA
                        dev data
  --output-dir OUTPUT_DIR
                        output dir
  --device DEVICE       device to run
  --verbose VERBOSE     verbose
```

1. `--config` is the extra configuration file to overwrite the default config. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints/` inside the training output directory. If you use the pretrained model, use `pwg_snapshot_iter_400000.pdz`.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder of the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis on; 'cpu' and 'gpu' are supported.

## Acknowledgement

We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@@ -0,0 +1,118 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle


class Clip(object):
    """Collate functor for training vocoders.
    """

    def __init__(
            self,
            batch_max_steps=20480,
            hop_size=256,
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.

        Parameters
        ----------
        batch_max_steps : int
            The maximum length of input signal in batch.
        hop_size : int
            Hop size of auxiliary features.
        aux_context_window : int
            Context window size for auxiliary feature conv.

        """
        if batch_max_steps % hop_size != 0:
            batch_max_steps += -(batch_max_steps % hop_size)
        assert batch_max_steps % hop_size == 0
        self.batch_max_steps = batch_max_steps
        self.batch_max_frames = batch_max_steps // hop_size
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window

        # set useful values in random cutting
        self.start_offset = aux_context_window
        self.end_offset = -(self.batch_max_frames + aux_context_window)
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window

    def __call__(self, examples):
        """Convert into batch tensors.

        Parameters
        ----------
        examples : list
            list of tuples of the pair of audio and features. Audio shape (T, ), features shape (T', C).

        Returns
        ----------
        Tensor
            Auxiliary feature batch (B, C, T'), where
            T = (T' - 2 * aux_context_window) * hop_size.
        Tensor
            Target signal batch (B, 1, T).

        """
        # check length
        examples = [
            self._adjust_length(b['wave'], b['feats']) for b in examples
            if b['feats'].shape[0] > self.mel_threshold
        ]
        xs, cs = [b[0] for b in examples], [b[1] for b in examples]

        # make batch with random cut
        c_lengths = [c.shape[0] for c in cs]
        start_frames = np.array([
            np.random.randint(self.start_offset, cl + self.end_offset)
            for cl in c_lengths
        ])
        x_starts = start_frames * self.hop_size
        x_ends = x_starts + self.batch_max_steps

        c_starts = start_frames - self.aux_context_window
        c_ends = start_frames + self.batch_max_frames + self.aux_context_window
        y_batch = np.stack(
            [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)])
        c_batch = np.stack(
            [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])

        # convert each batch to tensor, assume that each item in batch has the same length
        y_batch = paddle.to_tensor(
            y_batch, dtype=paddle.float32).unsqueeze(1)  # (B, 1, T)
        c_batch = paddle.to_tensor(
            c_batch, dtype=paddle.float32).transpose([0, 2, 1])  # (B, C, T')

        return y_batch, c_batch

    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

        Note
        -------
        Basically we assume that the lengths of x and c are adjusted
        in the preprocessing stage, but if we use features processed by
        another library, this step is needed.

        """
        if len(x) < c.shape[0] * self.hop_size:
            x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")

        # check that the length is valid
        assert len(x) == c.shape[
            0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"

        return x, c
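A hedged usage sketch of `Clip` with synthetic data (assumes paddle is available; shapes follow the docstrings above):

```python
import numpy as np

hop_size = 256
clip = Clip(batch_max_steps=4096, hop_size=hop_size, aux_context_window=2)
# 4 fake utterances: 100 frames of 80-dim mels, with matching waveforms.
examples = [
    {"wave": np.random.randn(100 * hop_size).astype(np.float32),
     "feats": np.random.randn(100, 80).astype(np.float32)}
    for _ in range(4)
]
y_batch, c_batch = clip(examples)
print(y_batch.shape)  # [4, 1, 4096]
print(c_batch.shape)  # [4, 80, 20] -> 16 frames + 2 * 2 context frames
```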

@@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
import os

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features.")
    parser.add_argument(
        "--metadata", type=str, help="json file with id and file paths.")
    parser.add_argument(
        "--field-name",
        type=str,
        help="name of the field to compute statistics for.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        help="directory to save statistics. if not provided, "
        "stats will be saved in the above root directory. (default=None)")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    config = get_cfg_default()
    # load config
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    if args.dumpdir is None:
        args.dumpdir = os.path.dirname(args.metadata)
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata,
        fields=[args.field_name],
        converters={args.field_name: np.load}, )
    logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics
    scaler = StandardScaler()
    for datum in tqdm(dataset):
        # StandardScaler supports (*, num_features) by default
        scaler.partial_fit(datum[args.field_name])

    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
    np.save(
        os.path.join(args.dumpdir, "stats.npy"),
        stats.astype(np.float32),
        allow_pickle=False)


if __name__ == "__main__":
    main()
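The resulting `stats.npy` stacks the feature-wise mean in row 0 and the scale (standard deviation) in row 1; this is the layout that `synthesize.py` above unpacks with `mu, std = stat`. A self-contained sketch of that layout (stand-in values, not real statistics):

```python
import numpy as np

n_mels = 80
stats = np.stack([np.zeros(n_mels), np.ones(n_mels)], axis=0)  # (2, n_mels)
mu, std = stats
mel = np.random.randn(100, n_mels)
normalized = (mel - mu) / std  # the z-score that ZScore applies at inference
print(stats.shape, normalized.shape)  # (2, 80) (100, 80)
```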

@@ -0,0 +1,128 @@
# This is the hyperparameter configuration file for Parallel WaveGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sr: 24000                # Sampling rate.
n_fft: 2048              # FFT size (in samples).
hop_length: 300          # Hop size (in samples).
win_length: 1200         # Window length (in samples).
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
n_mels: 80               # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation. (Hz)
fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
# global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
top_db: 60               # Need to tune carefully if the recording is not good.
trim_frame_length: 2048  # Frame size in trimming. (in samples)
trim_hop_length: 512     # Hop size in trimming. (in samples)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 1                # Number of input channels.
    out_channels: 1               # Number of output channels.
    kernel_size: 3                # Kernel size of dilated convolution.
    layers: 30                    # Number of residual block layers.
    stacks: 3                     # Number of stacks, i.e., dilation cycles.
    residual_channels: 64         # Number of channels in residual conv.
    gate_channels: 128            # Number of channels in gated conv.
    skip_channels: 64             # Number of channels in skip conv.
    aux_channels: 80              # Number of channels for auxiliary feature conv.
                                  # Must be the same as num_mels.
    aux_context_window: 2         # Context window size for auxiliary feature.
                                  # If set to 2, previous 2 and future 2 frames will be considered.
    dropout: 0.0                  # Dropout rate. 0.0 means no dropout applied.
    bias: true                    # use bias in residual blocks
    use_weight_norm: true         # Whether to use weight norm.
                                  # If set to true, it will be applied to all of the conv layers.
    use_causal_conv: false        # use causal conv in residual blocks and upsample layers
    # upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
    upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
    interpolate_mode: "nearest"   # upsample net interpolate mode
    freq_axis_kernel_size: 1      # upsampling net: convolution kernel size in frequency axis
    nonlinear_activation: null
    nonlinear_activation_params: {}

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1        # Number of input channels.
    out_channels: 1       # Number of output channels.
    kernel_size: 3        # Kernel size of conv layers.
    layers: 10            # Number of conv layers.
    conv_channels: 64     # Number of channels in conv layers.
    bias: true            # Whether to use bias parameter in conv.
    use_weight_norm: true # Whether to use weight norm.
                          # If set to true, it will be applied to all of the conv layers.
    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
    nonlinear_activation_params:      # Nonlinear function parameters
        negative_slope: 0.2           # Alpha in LeakyReLU.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 4.0  # Loss balancing coefficient.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 8              # Batch size.
batch_max_steps: 25500     # Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory: true           # Whether to pin memory in the DataLoader.
num_workers: 4             # Number of workers in the DataLoader.
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-6        # Generator's epsilon.
    weight_decay: 0.0      # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 0.0001  # Generator's learning rate.
    step_size: 200000      # Generator's scheduler step size.
    gamma: 0.5             # Generator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10    # Generator's gradient norm.
discriminator_optimizer_params:
    epsilon: 1.0e-6        # Discriminator's epsilon.
    weight_decay: 0.0      # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 0.00005 # Discriminator's learning rate.
    step_size: 200000      # Discriminator's scheduler step size.
    gamma: 0.5             # Discriminator's scheduler gamma.
                           # At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
num_snapshots: 10                # max number of snapshots to keep while training
seed: 42                         # random seed for paddle, random, and np.random
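As the `upsample_scales` comment notes, the product of the scales must equal the hop size; a quick self-contained check for this config:

```python
# The generator upsamples mel frames back to waveform rate, so the total
# upsampling factor must equal hop_length.
hop_length = 300
upsample_scales = [4, 5, 3, 5]
prod = 1
for s in upsample_scales:
    prod *= s
assert prod == hop_length  # 4 * 5 * 3 * 5 == 300
```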

@@ -0,0 +1,28 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
from pathlib import Path
from yacs.config import CfgNode as Configuration

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()

with open(config_path, 'rt') as f:
    _C = yaml.safe_load(f)
    _C = Configuration(_C)


def get_cfg_default():
    config = _C.clone()
    return config

@@ -0,0 +1,142 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""

import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="metadata file of the dumped raw features to be normalized.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--stats", type=str, required=True, help="statistics file.")
    parser.add_argument(
        "--skip-wav-copy",
        default=False,
        action="store_true",
        help="whether to skip the copy of wav files.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    dumpdir = Path(args.dumpdir).resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata,
        fields=["utt_id", "wave", "feats"],
        converters={
            'utt_id': None,
            'wave': None if args.skip_wav_copy else np.load,
            'feats': np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    scaler.mean_ = np.load(args.stats)[0]
    scaler.scale_ = np.load(args.stats)[1]

    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        wave = item['wave']
        mel = item['feats']
        # normalize
        mel = scaler.transform(mel)

        # save
        mel_path = dumpdir / f"{utt_id}-feats.npy"
        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
        if not args.skip_wav_copy:
            wav_path = dumpdir / f"{utt_id}-wave.npy"
            np.save(wav_path, wave.astype(np.float32), allow_pickle=False)
        else:
            wav_path = wave
        output_metadata.append({
            'utt_id': utt_id,
            'wave': str(wav_path),
            'feats': str(mel_path),
        })
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()

@@ -0,0 +1,232 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from operator import itemgetter
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path,
                     mel_extractor=None):
    utt_id = fp.stem

    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # trim according to the alignment file
    alignment = tgio.openTextgrid(alignment_fp)
    intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
    first, last = intervals[0], intervals[-1]
    start = 0
    end = last.end
    if first.label == "sil" and first.end < duration:
        start = first.end
    else:
        logging.warning(
            f" There is something wrong with the first interval {first} in utterance: {utt_id}"
        )
    if last.label == "sil" and last.start < duration:
        end = last.start
    else:
        end = duration
        logging.warning(
            f" There is something wrong with the last interval {last} in utterance: {utt_id}"
        )
    # silence trimmed
    start, end = librosa.time_to_samples([start, end], sr=sr)
    y = y[start:end]

    # energy based silence trimming
    if config.trim_silence:
        y, _ = librosa.effects.trim(
            y,
            top_db=config.top_db,
            frame_length=config.trim_frame_length,
            hop_length=config.trim_hop_length)

    # extract mel feats
    logmel = mel_extractor.get_log_mel_fbank(y)

    # adjust time to make num_samples == num_frames * hop_length
    num_frames = logmel.shape[0]
    if y.size < num_frames * config.hop_length:
        y = np.pad(
            y, (0, num_frames * config.hop_length - y.size), mode="reflect")
    else:
        y = y[:num_frames * config.hop_length]
    num_sample = y.shape[0]

    mel_path = output_dir / (utt_id + "_feats.npy")
    wav_path = output_dir / (utt_id + "_wave.npy")
    np.save(wav_path, y)  # (num_samples, )
    np.save(mel_path, logmel)  # (num_frames, n_mels)
    record = {
        "utt_id": utt_id,
        "num_samples": num_sample,
        "num_frames": num_frames,
        "feats": str(mel_path.resolve()),
        "wave": str(wav_path.resolve()),
    }
    return record


def process_sentences(config,
                      fps: List[Path],
                      alignment_fps: List[Path],
output_dir: Path,
|
||||
mel_extractor=None,
|
||||
nprocs: int=1):
|
||||
if nprocs == 1:
|
||||
results = []
|
||||
for fp, alignment_fp in tqdm.tqdm(zip(fps, alignment_fps)):
|
||||
results.append(
|
||||
process_sentence(config, fp, alignment_fp, output_dir,
|
||||
mel_extractor))
|
||||
else:
|
||||
with ThreadPoolExecutor(nprocs) as pool:
|
||||
futures = []
|
||||
with tqdm.tqdm(total=len(fps)) as progress:
|
||||
for fp, alignment_fp in zip(fps, alignment_fps):
|
||||
future = pool.submit(process_sentence, config, fp,
|
||||
alignment_fp, output_dir,
|
||||
mel_extractor)
|
||||
future.add_done_callback(lambda p: progress.update())
|
||||
futures.append(future)
|
||||
|
||||
results = []
|
||||
for ft in futures:
|
||||
results.append(ft.result())
|
||||
|
||||
results.sort(key=itemgetter("utt_id"))
|
||||
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
|
||||
for item in results:
|
||||
writer.write(item)
|
||||
print("Done")
|
||||
|
||||
|
||||
def main():
|
||||
# parse config and args
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess audio and then extract features .")
|
||||
parser.add_argument(
|
||||
"--rootdir", default=None, type=str, help="directory to baker dataset.")
|
||||
parser.add_argument(
|
||||
"--dumpdir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="directory to dump feature files.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="yaml format configuration file.")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
type=int,
|
||||
default=1,
|
||||
help="logging level. higher is more logging. (default=1)")
|
||||
parser.add_argument(
|
||||
"--num_cpu", type=int, default=1, help="number of process.")
|
||||
args = parser.parse_args()
|
||||
|
||||
C = get_cfg_default()
|
||||
if args.config:
|
||||
C.merge_from_file(args.config)
|
||||
C.freeze()
|
||||
|
||||
if args.verbose > 1:
|
||||
print(vars(args))
|
||||
print(C)
|
||||
|
||||
root_dir = Path(args.rootdir).expanduser()
|
||||
dumpdir = Path(args.dumpdir).expanduser()
|
||||
dumpdir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
|
||||
alignment_files = sorted(
|
||||
list((root_dir / "PhoneLabeling").rglob("*.interval")))
|
||||
|
||||
# split data into 3 sections
|
||||
num_train = 9800
|
||||
num_dev = 100
|
||||
|
||||
train_wav_files = wav_files[:num_train]
|
||||
dev_wav_files = wav_files[num_train:num_train + num_dev]
|
||||
test_wav_files = wav_files[num_train + num_dev:]
|
||||
|
||||
train_alignment_files = alignment_files[:num_train]
|
||||
dev_alignment_files = alignment_files[num_train:num_train + num_dev]
|
||||
test_alignment_files = alignment_files[num_train + num_dev:]
|
||||
|
||||
train_dump_dir = dumpdir / "train" / "raw"
|
||||
train_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
dev_dump_dir = dumpdir / "dev" / "raw"
|
||||
dev_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
test_dump_dir = dumpdir / "test" / "raw"
|
||||
test_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
mel_extractor = LogMelFBank(
|
||||
sr=C.sr,
|
||||
n_fft=C.n_fft,
|
||||
hop_length=C.hop_length,
|
||||
win_length=C.win_length,
|
||||
window=C.window,
|
||||
n_mels=C.n_mels,
|
||||
fmin=C.fmin,
|
||||
fmax=C.fmax)
|
||||
|
||||
# process for the 3 sections
|
||||
process_sentences(
|
||||
C,
|
||||
train_wav_files,
|
||||
train_alignment_files,
|
||||
train_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
dev_wav_files,
|
||||
dev_alignment_files,
|
||||
dev_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
process_sentences(
|
||||
C,
|
||||
test_wav_files,
|
||||
test_alignment_files,
|
||||
test_dump_dir,
|
||||
mel_extractor=mel_extractor,
|
||||
nprocs=args.num_cpu)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --dumpdir=dump/train

python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
@@ -0,0 +1,231 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict

import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.updaters.standard_updater import UpdaterState
from timer import timer

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class PWGUpdater(StandardUpdater):
    def __init__(self,
                 models: Dict[str, Layer],
                 optimizers: Dict[str, Optimizer],
                 criterions: Dict[str, Layer],
                 schedulers: Dict[str, LRScheduler],
                 dataloader: DataLoader,
                 discriminator_train_start_steps: int,
                 lambda_adv: float,
                 output_dir=None):
        self.models = models
        self.generator: Layer = models['generator']
        self.discriminator: Layer = models['discriminator']

        self.optimizers = optimizers
        self.optimizer_g: Optimizer = optimizers['generator']
        self.optimizer_d: Optimizer = optimizers['discriminator']

        self.criterions = criterions
        self.criterion_stft = criterions['stft']
        self.criterion_mse = criterions['mse']

        self.schedulers = schedulers
        self.scheduler_g = schedulers['generator']
        self.scheduler_d = schedulers['discriminator']

        self.dataloader = dataloader

        self.discriminator_train_start_steps = discriminator_train_start_steps
        self.lambda_adv = lambda_adv
        self.state = UpdaterState(iteration=0, epoch=0)

        self.train_iterator = iter(self.dataloader)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        # parse batch
        wav, mel = batch

        # Generator
        noise = paddle.randn(wav.shape)

        with timer() as t:
            wav_ = self.generator(noise, mel)
            # logging.debug(f"Generator takes {t.elapse}s.")

        # initialize
        gen_loss = 0.0

        ## Multi-resolution stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            # logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s.")

        report("train/spectral_convergence_loss", float(sc_loss))
        report("train/log_stft_magnitude_loss", float(mag_loss))

        losses_dict["spectral_convergence_loss"] = float(sc_loss)
        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)

        gen_loss += sc_loss + mag_loss

        ## Adversarial loss
        if self.state.iteration > self.discriminator_train_start_steps:
            with timer() as t:
                p_ = self.discriminator(wav_)
                adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
                # logging.debug(
                #     f"Discriminator and adversarial loss takes {t.elapse}s")
            report("train/adversarial_loss", float(adv_loss))
            losses_dict["adversarial_loss"] = float(adv_loss)
            gen_loss += self.lambda_adv * adv_loss

        report("train/generator_loss", float(gen_loss))
        losses_dict["generator_loss"] = float(gen_loss)

        with timer() as t:
            self.optimizer_g.clear_grad()
            gen_loss.backward()
            # logging.debug(f"Backward takes {t.elapse}s.")

        with timer() as t:
            self.optimizer_g.step()
            self.scheduler_g.step()
            # logging.debug(f"Update takes {t.elapse}s.")

        # Discriminator
        if self.state.iteration > self.discriminator_train_start_steps:
            with paddle.no_grad():
                wav_ = self.generator(noise, mel)
            p = self.discriminator(wav)
            p_ = self.discriminator(wav_.detach())
            real_loss = self.criterion_mse(p, paddle.ones_like(p))
            fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
            dis_loss = real_loss + fake_loss
            report("train/real_loss", float(real_loss))
            report("train/fake_loss", float(fake_loss))
            report("train/discriminator_loss", float(dis_loss))
            losses_dict["real_loss"] = float(real_loss)
            losses_dict["fake_loss"] = float(fake_loss)
            losses_dict["discriminator_loss"] = float(dis_loss)

            self.optimizer_d.clear_grad()
            dis_loss.backward()

            self.optimizer_d.step()
            self.scheduler_d.step()

        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())


class PWGEvaluator(StandardEvaluator):
    def __init__(self,
                 models,
                 criterions,
                 dataloader,
                 lambda_adv,
                 output_dir=None):
        self.models = models
        self.generator = models['generator']
        self.discriminator = models['discriminator']

        self.criterions = criterions
        self.criterion_stft = criterions['stft']
        self.criterion_mse = criterions['mse']

        self.dataloader = dataloader
        self.lambda_adv = lambda_adv

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def evaluate_core(self, batch):
        # logging.debug("Evaluate: ")
        self.msg = "Evaluate: "
        losses_dict = {}

        wav, mel = batch
        noise = paddle.randn(wav.shape)

        with timer() as t:
            wav_ = self.generator(noise, mel)
            # logging.debug(f"Generator takes {t.elapse}s")

        ## Adversarial loss
        with timer() as t:
            p_ = self.discriminator(wav_)
            adv_loss = self.criterion_mse(p_, paddle.ones_like(p_))
            # logging.debug(
            #     f"Discriminator and adversarial loss takes {t.elapse}s")
        report("eval/adversarial_loss", float(adv_loss))
        losses_dict["adversarial_loss"] = float(adv_loss)
        gen_loss = self.lambda_adv * adv_loss

        # stft loss
        with timer() as t:
            sc_loss, mag_loss = self.criterion_stft(wav_, wav)
            # logging.debug(f"Multi-resolution STFT loss takes {t.elapse}s")

        report("eval/spectral_convergence_loss", float(sc_loss))
        report("eval/log_stft_magnitude_loss", float(mag_loss))
        losses_dict["spectral_convergence_loss"] = float(sc_loss)
        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)
        gen_loss += sc_loss + mag_loss

        report("eval/generator_loss", float(gen_loss))
        losses_dict["generator_loss"] = float(gen_loss)

        # Discriminator
        p = self.discriminator(wav)
        real_loss = self.criterion_mse(p, paddle.ones_like(p))
        fake_loss = self.criterion_mse(p_, paddle.zeros_like(p_))
        dis_loss = real_loss + fake_loss
        report("eval/real_loss", float(real_loss))
        report("eval/fake_loss", float(fake_loss))
        report("eval/discriminator_loss", float(dis_loss))

        losses_dict["real_loss"] = float(real_loss)
        losses_dict["fake_loss"] = float(fake_loss)
        losses_dict["discriminator_loss"] = float(dis_loss)

        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
@@ -0,0 +1,8 @@
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=1
@@ -0,0 +1,90 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path
from timer import timer

import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator

from config import get_cfg_default

parser = argparse.ArgumentParser(
    description="Synthesize with parallel wavegan.")
parser.add_argument(
    "--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="test data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument("--device", type=str, default="gpu", help="device to run.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")

args = parser.parse_args()
config = get_cfg_default()
if args.config:
    config.merge_from_file(args.config)

print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
    f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
)

paddle.set_device(args.device)
generator = PWGGenerator(**config["generator_params"])
state_dict = paddle.load(args.checkpoint)
generator.set_state_dict(state_dict["generator_params"])

generator.remove_weight_norm()
generator.eval()
with jsonlines.open(args.test_metadata, 'r') as reader:
    metadata = list(reader)

test_dataset = DataTable(
    metadata,
    fields=['utt_id', 'feats'],
    converters={
        'utt_id': None,
        'feats': np.load,
    })
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

N = 0
T = 0
for example in test_dataset:
    utt_id = example['utt_id']
    mel = example['feats']
    mel = paddle.to_tensor(mel)  # (T, C)
    with timer() as t:
        wav = generator.inference(c=mel)
    wav = wav.numpy()
    N += wav.size
    T += t.elapse
    speed = wav.size / t.elapse
    print(
        f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {config.sr / speed}."
    )
    sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.sr)
print(f"generation speed: {N / T}Hz, RTF: {config.sr / (N / T)}")
@@ -0,0 +1,5 @@
python3 synthesize.py \
    --config=conf/default.yaml \
    --checkpoint=exp/default/checkpoints/snapshot_iter_220000.pdz \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test
@@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging
from pathlib import Path

import librosa
import numpy as np
import paddle
import soundfile as sf
import yaml
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore

from config import get_cfg_default


def evaluate(args, config):
    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    vocoder = PWGGenerator(**config["generator_params"])
    state_dict = paddle.load(args.checkpoint)
    vocoder.set_state_dict(state_dict["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    normalizer = ZScore(mu, std)

    pwg_inference = PWGInference(normalizer, vocoder)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=config.sr,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    for utt_name in os.listdir(input_dir):
        wav, _ = librosa.load(str(input_dir / utt_name), sr=config.sr)
        # extract mel feats
        mel = mel_extractor.get_log_mel_fbank(wav)
        mel = paddle.to_tensor(mel)
        gen_wav = pwg_inference(mel)
        sf.write(
            str(output_dir / ("gen_" + utt_name)),
            gen_wav.numpy(),
            samplerate=config.sr)
        print(f"{utt_name} done!")


def main():
    # parse args and config, then run evaluate
    parser = argparse.ArgumentParser(
        description="Synthesize with parallel wavegan.")

    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument(
        "--stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument("--input-dir", type=str, help="input dir of wavs.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device to run.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)

    evaluate(args, config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,243 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import logging

import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam in paddle
from paddle.optimizer.lr import StepDecay
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGDiscriminator
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import Clip
from config import get_cfg_default
from pwg_updater import PWGUpdater
from pwg_updater import PWGEvaluator


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # the dataloader is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)
    dev_sampler = DistributedBatchSampler(
        dev_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False)
    print("samplers done!")

    train_batch_fn = Clip(
        batch_max_steps=config.batch_max_steps,
        hop_size=config.hop_length,
        aux_context_window=config.generator_params.aux_context_window)

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=train_batch_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        batch_sampler=dev_sampler,
        collate_fn=train_batch_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    generator = PWGGenerator(**config["generator_params"])
    discriminator = PWGDiscriminator(**config["discriminator_params"])
    if world_size > 1:
        generator = DataParallel(generator)
        discriminator = DataParallel(discriminator)
    print("models done!")

    criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"])
    criterion_mse = nn.MSELoss()
    print("criterions done!")

    lr_schedule_g = StepDecay(**config["generator_scheduler_params"])
    gradient_clip_g = nn.ClipGradByGlobalNorm(config["generator_grad_norm"])
    optimizer_g = Adam(
        learning_rate=lr_schedule_g,
        grad_clip=gradient_clip_g,
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = StepDecay(**config["discriminator_scheduler_params"])
    gradient_clip_d = nn.ClipGradByGlobalNorm(config["discriminator_grad_norm"])
    optimizer_d = Adam(
        learning_rate=lr_schedule_d,
        grad_clip=gradient_clip_d,
        parameters=discriminator.parameters(),
        **config["discriminator_optimizer_params"])
    print("optimizers done!")

    output_dir = Path(args.output_dir)
    if dist.get_rank() == 0:
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_dir / "config.yaml", 'wt') as f:
            f.write(config.dump(default_flow_style=None))

    updater = PWGUpdater(
        models={
            "generator": generator,
            "discriminator": discriminator,
        },
        optimizers={
            "generator": optimizer_g,
            "discriminator": optimizer_d,
        },
        criterions={
            "stft": criterion_stft,
            "mse": criterion_mse,
        },
        schedulers={
            "generator": lr_schedule_g,
            "discriminator": lr_schedule_d,
        },
        dataloader=train_dataloader,
        discriminator_train_start_steps=config.discriminator_train_start_steps,
        lambda_adv=config.lambda_adv,
        output_dir=output_dir)

    evaluator = PWGEvaluator(
        models={
            "generator": generator,
            "discriminator": discriminator,
        },
        criterions={
            "stft": criterion_stft,
            "mse": criterion_mse,
        },
        dataloader=dev_dataloader,
        lambda_adv=config.lambda_adv,
        output_dir=output_dir)
    trainer = Trainer(
        updater,
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir, )

    if dist.get_rank() == 0:
        trainer.extend(
            evaluator, trigger=(config.eval_interval_steps, 'iteration'))
        writer = LogWriter(str(trainer.out))
        trainer.extend(VisualDL(writer), trigger=(1, 'iteration'))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots),
            trigger=(config.save_interval_steps, 'iteration'))

    # print(trainer.extensions.keys())
    print("Trainer Done!")
    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
                                     "model with the Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
        raise RuntimeError("Multiprocess training on CPU is not supported.")
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.nprocs > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,141 @@
# Speedyspeech with the Baker dataset

This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset.

## Preprocess the dataset

Download the dataset from the [official website of data-baker](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. The dataset is then in the directory `~/datasets/BZNSYP`.

Run the script for preprocessing.

```bash
bash preprocess.sh
```

When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

```text
dump
├── dev
│   ├── norm
│   └── raw
├── test
│   ├── norm
│   └── raw
└── train
    ├── norm
    ├── raw
    └── stats.npy
```

The dataset is split into 3 parts, namely train, dev, and test, each of which contains a `norm` and a `raw` subfolder. The raw folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and stored in `dump/train/stats.npy`.
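
To make this concrete, here is a minimal sketch (not one of the example scripts; the feature file name is hypothetical) of how the saved statistics can be applied to a raw feature file, mirroring the scaler-restoring logic in `normalize.py`:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# stats.npy stores two rows: the per-dimension mean and scale
stats = np.load("dump/train/stats.npy")
scaler = StandardScaler()
scaler.mean_, scaler.scale_ = stats[0], stats[1]
# from sklearn 0.23.0 onward, n_features_in_ must be set explicitly
scaler.n_features_in_ = scaler.mean_.shape[0]

mel = np.load("dump/train/raw/000001_feats.npy")  # hypothetical (num_frames, n_mels) file
mel_norm = scaler.transform(mel)  # zero mean, unit variance per feature dimension
```
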
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, tones, durations, spectrogram path, and id of each utterance.
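
Since each line of `metadata.jsonl` is a single JSON object, it can be read with `jsonlines` just as the training scripts do. A minimal sketch (the field names follow the collate function in `batch_fn.py`; the record contents are illustrative):

```python
import jsonlines

with jsonlines.open("dump/train/raw/metadata.jsonl", 'r') as reader:
    metadata = list(reader)

# each record is a dict with keys such as "utt_id", "phones", "tones",
# "durations", "num_phones", "num_frames" and "feats"
first = metadata[0]
print(first["utt_id"], first["num_phones"], first["num_frames"])
```
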
## Train the model

To train the model, use `run.sh`. It is an example script that runs `train.py`.

```bash
bash run.sh
```

Or you can use `train.py` directly. Here's the complete help message.

```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
                [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]

Train a Speedyspeech model with the Baker Mandarin TTS dataset.

optional arguments:
  -h, --help            show this help message and exit
  --config CONFIG       config file to overwrite default config
  --train-metadata TRAIN_METADATA
                        training data
  --dev-metadata DEV_METADATA
                        dev data
  --output-dir OUTPUT_DIR
                        output dir
  --device DEVICE       device type to use
  --nprocs NPROCS       number of processes
  --verbose VERBOSE     verbose
```

1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata files in the normalized subfolders of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--device` is the type of device to run the experiment on; 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel. Note that nprocs > 1 is only supported when `--device` is 'gpu'.

## Pretrained Models

Pretrained models can be downloaded here:
1. Speedyspeech checkpoint. [speedyspeech_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_baker_ckpt_0.4.zip)
2. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.

The Speedyspeech checkpoint contains the files listed below.

```text
speedyspeech_baker_ckpt_0.4
├── speedyspeech_default.yaml             # default config used to train speedyspeech
├── speedy_speech_stats.npy               # statistics used to normalize spectrogram when training speedyspeech
└── speedyspeech_snapshot_iter_91800.pdz  # model parameters and optimizer states
```

The Parallel WaveGAN checkpoint contains the files listed below.

```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml               # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz   # model parameters and optimizer states of parallel wavegan
└── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
```

## Synthesize End to End

When training is done or pretrained models are downloaded, you can run `synthesize_e2e.py` to synthesize.

```text
usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
                         [--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT]
                         [--speedyspeech-stat SPEEDYSPEECH_STAT]
                         [--pwg-config PWG_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT]
                         [--pwg-stat PWG_STAT] [--text TEXT]
                         [--output-dir OUTPUT_DIR]
                         [--inference-dir INFERENCE_DIR] [--device DEVICE]
                         [--verbose VERBOSE]

Synthesize with speedyspeech & parallel wavegan.

optional arguments:
  -h, --help            show this help message and exit
  --speedyspeech-config SPEEDYSPEECH_CONFIG
                        config file for speedyspeech.
  --speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT
                        speedyspeech checkpoint to load.
  --speedyspeech-stat SPEEDYSPEECH_STAT
                        mean and standard deviation used to normalize
                        spectrogram when training speedyspeech.
  --pwg-config PWG_CONFIG
                        config file for parallel wavegan.
  --pwg-checkpoint PWG_CHECKPOINT
                        parallel wavegan checkpoint to load.
  --pwg-stat PWG_STAT   mean and standard deviation used to normalize
                        spectrogram when training parallel wavegan.
  --text TEXT           text to synthesize, a 'utt_id sentence' pair per line
  --output-dir OUTPUT_DIR
                        output dir
  --inference-dir INFERENCE_DIR
                        dir to save inference models
  --device DEVICE       device type to use
  --verbose VERBOSE     verbose
```

1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model.
2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model.
3. `--text` is the text file which contains sentences to synthesize (see the example below this list).
4. `--output-dir` is the directory to save synthesized audio files.
5. `--inference-dir` is the directory to save the exported model, which can be used with paddle inference.
6. `--device` is the type of device to run synthesis; 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
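
For example, a text file for `--text` could look like the following (the ids and sentences are made up for illustration; each line is an utterance id, a space, then a sentence without internal spaces, since the script splits each line on whitespace):

```text
001 欢迎使用飞桨深度学习框架。
002 语音合成技术有着广泛的应用。
```
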
@@ -0,0 +1,42 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from parakeet.data.batch import batch_sequences


def collate_baker_examples(examples):
    # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
    phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
    durations = [
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    num_phones = np.array([item["num_phones"] for item in examples])
    num_frames = np.array([item["num_frames"] for item in examples])

    # pad the variable-length sequences in the batch to a common length
    phones = batch_sequences(phones)
    tones = batch_sequences(tones)
    feats = batch_sequences(feats)
    durations = batch_sequences(durations)
    batch = {
        "phones": phones,
        "tones": tones,
        "num_phones": num_phones,
        "num_frames": num_frames,
        "feats": feats,
        "durations": durations,
    }
    return batch
@@ -0,0 +1,107 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features.")
    parser.add_argument(
        "--metadata", type=str, help="jsonl file with id and file paths.")
    parser.add_argument(
        "--field-name",
        type=str,
        help="name of the field to compute statistics for.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--output",
        type=str,
        help="path to save statistics. if not provided, "
        "stats will be saved in the above root directory with name stats.npy")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    if args.output is None:
        args.output = Path(args.metadata).parent.with_name("stats.npy")
    else:
        args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)

    metadata_dir = Path(args.metadata).parent
    for item in metadata:
        item["feats"] = str(metadata_dir / item["feats"])

    dataset = DataTable(
        metadata,
        fields=[args.field_name],
        converters={args.field_name: np.load}, )
    logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics
    scaler = StandardScaler()
    for datum in tqdm(dataset):
        # StandardScaler supports (*, num_features) by default
        scaler.partial_fit(datum[args.field_name])

    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
    np.save(str(args.output), stats.astype(np.float32), allow_pickle=False)


if __name__ == "__main__":
    main()
@@ -0,0 +1,60 @@
###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
sr: 24000                # Sampling rate.
n_fft: 2048              # FFT size.
hop_length: 300          # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as n_fft.
window: "hann"           # Window function.
n_mels: 80               # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
# global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
top_db: 60               # Need to tune carefully if the recording is not good.
trim_frame_length: 2048  # Frame size in trimming (in samples).
trim_hop_length: 512     # Hop size in trimming (in samples).


###########################################################
#                       DATA SETTING                       #
###########################################################
batch_size: 32
num_workers: 4


###########################################################
#                      MODEL SETTING                       #
###########################################################
model:
  vocab_size: 101  # 99 + 2
  tone_size: 8     # 6 + 2
  encoder_hidden_size: 128
  encoder_kernel_size: 3
  encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
  duration_predictor_hidden_size: 128
  decoder_hidden_size: 128
  decoder_output_size: 80
  decoder_kernel_size: 3
  decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]


###########################################################
#                     OPTIMIZER SETTING                    #
###########################################################


###########################################################
#                     TRAINING SETTING                     #
###########################################################
max_epoch: 300
num_snapshots: 5


###########################################################
#                      OTHER SETTING                       #
###########################################################
seed: 10086
@@ -0,0 +1,28 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
from yacs.config import CfgNode as Configuration
from pathlib import Path

config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()

with open(config_path, 'rt') as f:
    _C = yaml.safe_load(f)
    _C = Configuration(_C)


def get_cfg_default():
    config = _C.clone()
    return config
@@ -0,0 +1,94 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from pathlib import Path

import numpy as np
import paddle
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
phkit.initialize()
from parakeet.frontend.vocab import Vocab

file_dir = Path(__file__).parent.resolve()
with open(file_dir / "phones.txt", 'rt') as f:
    phones = [line.strip() for line in f.readlines()]

with open(file_dir / "tones.txt", 'rt') as f:
    tones = [line.strip() for line in f.readlines()]
voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)


def segment(sentence):
    segments = re.split(r'[:,;。?!]', sentence)
    segments = [seg for seg in segments if len(seg)]
    return segments


def g2p(sentence):
    segments = segment(sentence)
    phones = []
    phones.append('sil')
    tones = []
    tones.append('0')

    for seg in segments:
        seg = jieba.lcut(seg)
        initials = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for c, v in zip(initials, finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if re.match(r'i\d', v):
                if c in ['z', 'c', 's']:
                    v = re.sub('i', 'ii', v)
                elif c in ['zh', 'ch', 'sh', 'r']:
                    v = re.sub('i', 'iii', v)
            if c:
                phones.append(c)
                tones.append('0')
            if v:
                phones.append(v[:-1])
                tones.append(v[-1])
        phones.append('sp')
        tones.append('0')
    phones[-1] = 'sil'
    tones[-1] = '0'
    return (phones, tones)


def p2id(voc, phonemes):
    phone_ids = [voc.lookup(item) for item in phonemes]
    return np.array(phone_ids, np.int64)


def t2id(voc, tones):
    tone_ids = [voc.lookup(item) for item in tones]
    return np.array(tone_ids, np.int64)


def text_analysis(sentence):
    phonemes, tones = g2p(sentence)
    print(sentence)
    print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])
    phone_ids = p2id(voc_phones, phonemes)
    tone_ids = t2id(voc_tones, tones)
    phones = paddle.to_tensor(phone_ids)
    tones = paddle.to_tensor(tone_ids)
    return phones, tones
@@ -0,0 +1,129 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse
from pathlib import Path

import soundfile as sf
from paddle import inference

from frontend import text_analysis


def main():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--enable-auto-log", action="store_true", help="use auto log")

    args, _ = parser.parse_known_args()

    speedyspeech_config = inference.Config(
        str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
        str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
    speedyspeech_config.enable_use_gpu(100, 0)
    speedyspeech_config.enable_memory_optim()
    speedyspeech_predictor = inference.create_predictor(speedyspeech_config)

    pwg_config = inference.Config(
        str(Path(args.inference_dir) / "pwg.pdmodel"),
        str(Path(args.inference_dir) / "pwg.pdiparams"))
    pwg_config.enable_use_gpu(100, 0)
    pwg_config.enable_memory_optim()
    pwg_predictor = inference.create_predictor(pwg_config)

    if args.enable_auto_log:
        import auto_log
        os.makedirs("output", exist_ok=True)
        pid = os.getpid()
        logger = auto_log.AutoLogger(
            model_name="speedyspeech",
            model_precision='float32',
            batch_size=1,
            data_shape="dynamic",
            save_path="./output/auto_log.log",
            inference_config=speedyspeech_config,
            pids=pid,
            process_name=None,
            gpu_ids=0,
            time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
            warmup=0)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    for utt_id, sentence in sentences:
        if args.enable_auto_log:
            logger.times.start()

        phones, tones = text_analysis(sentence)
        phones = phones.numpy()
        tones = tones.numpy()

        if args.enable_auto_log:
            logger.times.stamp()

        input_names = speedyspeech_predictor.get_input_names()
        phones_handle = speedyspeech_predictor.get_input_handle(input_names[0])
        tones_handle = speedyspeech_predictor.get_input_handle(input_names[1])

        phones_handle.reshape(phones.shape)
        phones_handle.copy_from_cpu(phones)
        tones_handle.reshape(tones.shape)
        tones_handle.copy_from_cpu(tones)

        speedyspeech_predictor.run()
        output_names = speedyspeech_predictor.get_output_names()
        output_handle = speedyspeech_predictor.get_output_handle(
            output_names[0])
        output_data = output_handle.copy_to_cpu()

        input_names = pwg_predictor.get_input_names()
        mel_handle = pwg_predictor.get_input_handle(input_names[0])
        mel_handle.reshape(output_data.shape)
        mel_handle.copy_from_cpu(output_data)

        pwg_predictor.run()
        output_names = pwg_predictor.get_output_names()
        output_handle = pwg_predictor.get_output_handle(output_names[0])
        wav = output_handle.copy_to_cpu()

        if args.enable_auto_log:
            logger.times.stamp()

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)

        if args.enable_auto_log:
            logger.times.end(stamp=True)
        print(f"{utt_id} done!")

    if args.enable_auto_log:
        logger.report()


if __name__ == "__main__":
    main()
@@ -0,0 +1,5 @@
python inference.py \
    --inference-dir=exp/default/inference \
    --text=sentences.txt \
    --output-dir=exp/default/pd_infer_out
@@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""

import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.frontend.vocab import Vocab

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (see detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="metadata file (jsonl) listing the feature files to be normalized.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--stats", type=str, required=True, help="statistics file.")
    parser.add_argument(
        "--phones",
        type=str,
        default="phones.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--tones", type=str, default="tones.txt", help="tone vocabulary file.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    dumpdir = Path(args.dumpdir).resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    metadata_dir = Path(args.metadata).parent
    for item in metadata:
        item["feats"] = str(metadata_dir / item["feats"])

    dataset = DataTable(
        metadata, converters={
            'feats': np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler (load the stats file once)
    scaler = StandardScaler()
    stats = np.load(args.stats)
    scaler.mean_ = stats[0]
    scaler.scale_ = stats[1]

    # from sklearn version 0.23.0 onward, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    with open(args.phones, 'rt') as f:
        phones = [line.strip() for line in f.readlines()]

    with open(args.tones, 'rt') as f:
        tones = [line.strip() for line in f.readlines()]
    voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
    voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        mel = item['feats']
        # normalize
        mel = scaler.transform(mel)

        # save
        mel_path = dumpdir / f"{utt_id}-feats.npy"
        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
        phone_ids = [voc_phones.lookup(p) for p in item['phones']]
        tone_ids = [voc_tones.lookup(t) for t in item['tones']]
        output_metadata.append({
            'utt_id': utt_id,
            'phones': phone_ids,
            'tones': tone_ids,
            'num_phones': item['num_phones'],
            'num_frames': item['num_frames'],
            'durations': item['durations'],
            'feats': str(mel_path.relative_to(dumpdir)),
        })
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,99 @@
b
p
m
f
d
t
n
l
g
k
h
zh
ch
sh
r
z
c
s
j
q
x
a
ar
ai
air
ao
aor
an
anr
ang
angr
e
er
ei
eir
en
enr
eng
engr
o
or
ou
our
ong
ongr
ii
iir
iii
iiir
i
ir
ia
iar
iao
iaor
ian
ianr
iang
iangr
ie
ier
io
ior
iou
iour
iong
iongr
in
inr
ing
ingr
u
ur
ua
uar
uai
uair
uan
uanr
uang
uangr
uei
ueir
uo
uor
uen
uenr
ueng
uengr
v
vr
ve
ver
van
vanr
vn
vnr
sil
sp
@@ -0,0 +1,260 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from operator import itemgetter
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import logging
import numpy as np
import re
import tqdm
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from pathlib import Path
from praatio import tgio

from config import get_cfg_default
from tg_utils import validate_textgrid


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path,
                     mel_extractor=None):
    utt_id = fp.stem

    # reading
    y, sr = librosa.load(str(fp), sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(
        y).max() <= 1.0, f"{utt_id} does not seem to be normalized 16-bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # intervals with empty labels are ignored
    alignment = tgio.openTextgrid(alignment_fp)

    # validate text grid against audio file
    num_samples = y.shape[0]
    validate_textgrid(alignment, num_samples, sr)

    # only with baker's annotation
    intervals = alignment.tierDict[alignment.tierNameList[0]].entryList

    first, last = intervals[0], intervals[-1]
    if not (first.label == "sil" and first.end < duration):
        logging.warning(
            f"There is something wrong with the first interval {first} in utterance: {utt_id}"
        )
    if not (last.label == "sil" and last.start < duration):
        logging.warning(
            f"There is something wrong with the last interval {last} in utterance: {utt_id}"
        )

    # extract mel feats
    logmel = mel_extractor.get_log_mel_fbank(y)

    # extract phone and duration
    phones = []
    tones = []
    ends = []
    durations_sec = []

    for interval in intervals:
        label = interval.label
        label = label.replace("sp1", "sp")  # Baker has sp1 rather than sp

        # split tone from finals
        match = re.match(r'^(\w+)([012345])$', label)
        if match:
            phones.append(match.group(1))
            tones.append(match.group(2))
        else:
            phones.append(label)
            tones.append('0')
        end = min(duration, interval.end)
        ends.append(end)
        durations_sec.append(end - interval.start)  # duration in seconds

    frame_pos = librosa.time_to_frames(
        ends, sr=sr, hop_length=config.hop_length)
    durations_frame = np.diff(frame_pos, prepend=0)

    num_frames = logmel.shape[0]  # number of frames of the spectrogram
    extra = np.sum(durations_frame) - num_frames
    assert extra <= 0, (
        f"Number of frames inferred from alignment is "
        f"larger than number of frames of the spectrogram by {extra} frames")
    durations_frame[-1] += (-extra)

    assert np.sum(durations_frame) == num_frames
    durations_frame = durations_frame.tolist()

    mel_path = output_dir / (utt_id + "_feats.npy")
    np.save(mel_path, logmel)  # (num_frames, n_mels)
    record = {
        "utt_id": utt_id,
        "phones": phones,
        "tones": tones,
        "num_phones": len(phones),
        "num_frames": num_frames,
        "durations": durations_frame,
        "feats": mel_path,  # Path object
    }
    return record


def process_sentences(config,
                      fps: List[Path],
                      alignment_fps: List[Path],
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1):
    if nprocs == 1:
        results = []
        for fp, alignment_fp in tqdm.tqdm(
                zip(fps, alignment_fps), total=len(fps)):
            results.append(
                process_sentence(config, fp, alignment_fp, output_dir,
                                 mel_extractor))
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp, alignment_fp in zip(fps, alignment_fps):
                    future = pool.submit(process_sentence, config, fp,
                                         alignment_fp, output_dir,
                                         mel_extractor)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    results.append(ft.result())

    results.sort(key=itemgetter("utt_id"))
    output_dir = Path(output_dir)
    metadata_path = output_dir / "metadata.jsonl"
    # NOTE: use relative path to the meta jsonlines file
    with jsonlines.open(metadata_path, 'w') as writer:
        for item in results:
            item["feats"] = str(item["feats"].relative_to(output_dir))
            writer.write(item)
    print("Done")


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num_cpu", type=int, default=1, help="number of process.")
    args = parser.parse_args()

    C = get_cfg_default()
    if args.config:
        C.merge_from_file(args.config)
    C.freeze()

    if args.verbose > 1:
        print(vars(args))
        print(C)

    root_dir = Path(args.rootdir).expanduser()
    dumpdir = Path(args.dumpdir).expanduser()
    dumpdir.mkdir(parents=True, exist_ok=True)

    wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
    alignment_files = sorted(
        list((root_dir / "PhoneLabeling").rglob("*.interval")))

    # filter out several files that have errors in annotation
    exclude = {'000611', '000662', '002365', '005107'}
    wav_files = [f for f in wav_files if f.stem not in exclude]
    alignment_files = [f for f in alignment_files if f.stem not in exclude]

    # split data into 3 sections
    num_train = 9800
    num_dev = 100

    train_wav_files = wav_files[:num_train]
    dev_wav_files = wav_files[num_train:num_train + num_dev]
    test_wav_files = wav_files[num_train + num_dev:]

    train_alignment_files = alignment_files[:num_train]
    dev_alignment_files = alignment_files[num_train:num_train + num_dev]
    test_alignment_files = alignment_files[num_train + num_dev:]

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    mel_extractor = LogMelFBank(
        sr=C.sr,
        n_fft=C.n_fft,
        hop_length=C.hop_length,
        win_length=C.win_length,
        window=C.window,
        n_mels=C.n_mels,
        fmin=C.fmin,
        fmax=C.fmax)

    # process for the 3 sections
    process_sentences(
        C,
        train_wav_files,
        train_alignment_files,
        train_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        dev_wav_files,
        dev_alignment_files,
        dev_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        test_wav_files,
        test_alignment_files,
        test_dump_dir,
        mel_extractor=mel_extractor,
        nprocs=args.num_cpu)


if __name__ == "__main__":
    main()
@@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --output=dump/train/stats.npy

python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
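`compute_statistics.py` itself is not part of this diff. For reference, a minimal sketch of what it would have to produce, given how `normalize.py` above restores the scaler (`stats[0]` becomes `mean_`, `stats[1]` becomes `scale_`), could look like the following; the use of `StandardScaler.partial_fit` here is an assumption, not necessarily the repo's actual implementation.

```python
# A minimal sketch (not the repo's compute_statistics.py): produce a stats.npy
# in the layout normalize.py expects, i.e. row 0 = per-bin mean, row 1 = per-bin
# scale (standard deviation) of the training mel spectrograms.
from pathlib import Path

import jsonlines
import numpy as np
from sklearn.preprocessing import StandardScaler

metadata_path = Path("dump/train/raw/metadata.jsonl")
metadata_dir = metadata_path.parent

scaler = StandardScaler()
with jsonlines.open(metadata_path, 'r') as reader:
    for item in reader:
        # each record's "feats" is a path relative to the metadata file
        mel = np.load(metadata_dir / item["feats"])  # (num_frames, n_mels)
        scaler.partial_fit(mel)  # accumulate running mean/variance per mel bin

# saved so that normalize.py can read stats[0] as mean_ and stats[1] as scale_
stats = np.stack([scaler.mean_, scaler.scale_]).astype(np.float32)
np.save("dump/train/stats.npy", stats)
```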
@@ -0,0 +1,6 @@
python train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=1
@@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
@@ -0,0 +1,158 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.modules.ssim import ssim
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class SpeedySpeechUpdater(StandardUpdater):
    def __init__(self,
                 model,
                 optimizer,
                 dataloader,
                 init_state=None,
                 output_dir=None):
        # forward the caller's init_state instead of hard-coding None
        super().__init__(model, optimizer, dataloader, init_state=init_state)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        decoded, predicted_durations = self.model(
            text=batch["phones"],
            tones=batch["tones"],
            plens=batch["num_phones"],
            durations=batch["durations"])

        target_mel = batch["feats"]
        spec_mask = F.sequence_mask(
            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
        text_mask = F.sequence_mask(
            batch["num_phones"], dtype=predicted_durations.dtype)

        # spec loss
        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)

        # duration loss
        target_durations = batch["durations"]
        target_durations = paddle.maximum(
            target_durations.astype(predicted_durations.dtype),
            paddle.to_tensor([1.0]))
        duration_loss = weighted_mean(
            huber_loss(
                predicted_durations, paddle.log(target_durations), delta=1.0),
            text_mask, )

        # ssim loss
        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
                               (target_mel * spec_mask).unsqueeze(1))

        loss = l1_loss + ssim_loss + duration_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/l1_loss", float(l1_loss))
        report("train/duration_loss", float(duration_loss))
        report("train/ssim_loss", float(ssim_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["ssim_loss"] = float(ssim_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())


class SpeedySpeechEvaluator(StandardEvaluator):
    def __init__(self, model, dataloader, output_dir=None):
        super().__init__(model, dataloader)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}

        decoded, predicted_durations = self.model(
            text=batch["phones"],
            tones=batch["tones"],
            plens=batch["num_phones"],
            durations=batch["durations"])

        target_mel = batch["feats"]
        spec_mask = F.sequence_mask(
            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
        text_mask = F.sequence_mask(
            batch["num_phones"], dtype=predicted_durations.dtype)

        # spec loss
        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)

        # duration loss
        target_durations = batch["durations"]
        target_durations = paddle.maximum(
            target_durations.astype(predicted_durations.dtype),
            paddle.to_tensor([1.0]))
        duration_loss = weighted_mean(
            huber_loss(
                predicted_durations, paddle.log(target_durations), delta=1.0),
            text_mask, )

        # ssim loss
        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
                               (target_mel * spec_mask).unsqueeze(1))

        loss = l1_loss + ssim_loss + duration_loss

        report("eval/loss", float(loss))
        report("eval/l1_loss", float(l1_loss))
        report("eval/duration_loss", float(duration_loss))
        report("eval/ssim_loss", float(ssim_loss))

        losses_dict["l1_loss"] = float(l1_loss)
        losses_dict["duration_loss"] = float(duration_loss)
        losses_dict["ssim_loss"] = float(ssim_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
@@ -0,0 +1,161 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import argparse
from pathlib import Path

import jsonlines
import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore


def evaluate(args, speedyspeech_config, pwg_config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
        test_metadata = list(reader)
    test_dataset = DataTable(
        data=test_metadata, fields=["utt_id", "phones", "tones"])

    model = SpeedySpeech(**speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)
    speedyspeech_normalizer.eval()

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)
    pwg_normalizer.eval()

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    speedyspeech_inference.eval()
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec([-1], dtype=paddle.int64), InputSpec(
                [-1], dtype=paddle.int64)
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))
    speedyspeech_inference = paddle.jit.load(
        os.path.join(args.inference_dir, "speedyspeech"))

    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference, input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for datum in test_dataset:
        utt_id = datum["utt_id"]
        phones = paddle.to_tensor(datum["phones"])
        tones = paddle.to_tensor(datum["tones"])

        with paddle.no_grad():
            wav = pwg_inference(speedyspeech_inference(phones, tones))
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.sr)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="config file for parallelwavegan.")
    parser.add_argument(
        "--pwg-checkpoint",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args, _ = parser.parse_known_args()
    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,11 @@
python synthesize.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-stat=dump/train/stats.npy \
    --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
    --pwg-checkpoint=../../parallelwave_gan/baker/exp/default/checkpoints/snapshot_iter_400000.pdz \
    --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --inference-dir=exp/default/inference \
    --device="gpu"
@@ -0,0 +1,162 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging
import argparse
from pathlib import Path

import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode

from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import text_analysis


def evaluate(args, speedyspeech_config, pwg_config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    model = SpeedySpeech(**speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    speedyspeech_inference.eval()
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec([-1], dtype=paddle.int64), InputSpec(
                [-1], dtype=paddle.int64)
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))
    speedyspeech_inference = paddle.jit.load(
        os.path.join(args.inference_dir, "speedyspeech"))

    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference, input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        phones, tones = text_analysis(sentence)

        with paddle.no_grad():
            wav = pwg_inference(speedyspeech_inference(phones, tones))
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.sr)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="config file for parallelwavegan.")
    parser.add_argument(
        "--pwg-checkpoint",
        type=str,
        help="parallel wavegan checkpoint to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args, _ = parser.parse_known_args()
    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,11 @@
python synthesize_e2e.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-stat=dump/train/stats.npy \
    --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
    --pwg-checkpoint=../../parallelwave_gan/baker/exp/default/checkpoints/snapshot_iter_400000.pdz \
    --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
    --text=sentences.txt \
    --output-dir=exp/default/e2e \
    --inference-dir=exp/default/inference \
    --device="gpu"
@@ -0,0 +1,26 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa


def validate_textgrid(text_grid, num_samples, sr):
    """Validate a TextGrid to make sure that the time interval annotated
    by the text grid file does not go beyond the audio file.
    """
    start = text_grid.minTimestamp
    end = text_grid.maxTimestamp

    end_audio = librosa.samples_to_time(num_samples, sr)
    return start == 0.0 and end <= end_audio
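A small usage sketch for `validate_textgrid`; the wav/TextGrid paths below are placeholders, and the 24 kHz rate matches the `samplerate` used by `inference.py` above (`preprocess.py` reads it from the config instead).

```python
# Check one Baker utterance before feature extraction (paths are placeholders).
import os

import librosa
from praatio import tgio

from tg_utils import validate_textgrid

wav_path = os.path.expanduser("~/datasets/BZNSYP/Wave/000001.wav")
tg_path = os.path.expanduser("~/datasets/BZNSYP/PhoneLabeling/000001.interval")

y, sr = librosa.load(wav_path, sr=24000)  # resample to the training rate
alignment = tgio.openTextgrid(tg_path)
assert validate_textgrid(alignment, y.shape[0], sr), \
    "TextGrid annotation extends beyond the audio."
```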
@@ -0,0 +1,6 @@
0
1
2
3
4
5
@@ -0,0 +1,194 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os

import jsonlines
import numpy as np
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import nn
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter

from batch_fn import collate_baker_examples
from config import get_cfg_default
from speedyspeech_updater import SpeedySpeechUpdater
from speedyspeech_updater import SpeedySpeechEvaluator


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    metadata_dir = Path(args.train_metadata).parent
    for item in train_metadata:
        item["feats"] = str(metadata_dir / item["feats"])

    train_dataset = DataTable(
        data=train_metadata,
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={
            "feats": np.load,
        }, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    metadata_dir = Path(args.dev_metadata).parent
    for item in dev_metadata:
        item["feats"] = str(metadata_dir / item["feats"])
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={
            "feats": np.load,
        }, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=True)
    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)
    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)
    print("dataloaders done!")

    model = SpeedySpeech(**config["model"])
    if world_size > 1:
        model = DataParallel(model)  # TODO, do not use vocab size from config
    print("model done!")
    optimizer = Adam(
        0.001,
        parameters=model.parameters(),
        grad_clip=nn.ClipGradByGlobalNorm(5.0))
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    updater = SpeedySpeechUpdater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        output_dir=output_dir)

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = SpeedySpeechEvaluator(
        model, dev_dataloader, output_dir=output_dir)

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        writer = LogWriter(str(output_dir))
        trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Train a SpeedySpeech model with the Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use.")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args, rest = parser.parse_known_args()
    if args.device == "cpu" and args.nprocs > 1:
        raise RuntimeError("Multiprocess training on CPU is not supported.")
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)
    if rest:
        extra = []
        # to support key=value format
        for item in rest:
            extra.extend(item.split("=", maxsplit=1))
        config.merge_from_list(extra)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.nprocs > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
@@ -58,10 +58,10 @@ For more help on arguments

 ## Synthesis

-After training the Tacotron2, spectrogram can be synthesized by running ``synthesis.py``.
+After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``.

 ```bash
-python synthesis.py \
+python synthesize.py \
     --config=${CONFIGPATH} \
     --checkpoint_path=${CHECKPOINTPATH} \
     --input=${TEXTPATH} \
@@ -46,8 +46,7 @@ class LJSpeech(Dataset):
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""

-    def __init__(self, padding_idx=0, padding_value=0.,
-                 padding_stop_token=1.0):
+    def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
         self.padding_idx = padding_idx
         self.padding_value = padding_value
         self.padding_stop_token = padding_stop_token
@@ -63,8 +63,7 @@ def create_dataset(config, source_path, target_path, verbose=False):
     with open(target_path / "metadata.pkl", 'wb') as f:
         pickle.dump(records, f)
     if verbose:
-        print("saved metadata into {}".format(target_path /
-                                              "metadata.pkl"))
+        print("saved metadata into {}".format(target_path / "metadata.pkl"))

     print("Done.")
@@ -14,14 +14,13 @@

 import time
 from collections import defaultdict

 import numpy as np

 import paddle
-from paddle.io import DataLoader
-from paddle.io import DistributedBatchSampler
 from paddle import distributed as dist
+from paddle.io import DataLoader, DistributedBatchSampler

 from parakeet.data import dataset
 from parakeet.frontend import EnglishCharacter  # pylint: disable=unused-import
 from parakeet.training.cli import default_argument_parser
 from parakeet.training.experiment import ExperimentBase
 from parakeet.utils import display, mp_tools
@@ -74,8 +73,7 @@ class Experiment(ExperimentBase):

         if dist.get_rank() == 0:
             for k, v in losses_np.items():
-                self.visualizer.add_scalar(f"train_loss/{k}", v,
-                                           self.iteration)
+                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -65,8 +65,8 @@ def collate_aishell3_examples(examples):
     text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
     spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
     T_dec = np.max(spec_lengths)
-    stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)
-                   ).astype(np.float32)
+    stop_tokens = (
+        np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
     phones, _ = batch_text_id(phones)
     tones, _ = batch_text_id(tones)
     mel, _ = batch_spec(mel)
@@ -121,8 +121,8 @@ def convert(syllable):
     syllable = syllable.replace("ing", "ieng").replace("in", "ien")

     # expansion for un, ui, iu
-    syllable = syllable.replace("un", "uen").replace(
-        "ui", "uei").replace("iu", "iou")
+    syllable = syllable.replace("un", "uen").replace("ui",
+                                                     "uei").replace("iu", "iou")

     # rule for variants of i
     syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
@@ -68,8 +68,7 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
         alignment_dir=alignment_dir)
     with Pool(16) as p:
         list(
-            tqdm(
-                p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
+            tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


 if __name__ == "__main__":
@@ -109,8 +109,7 @@ class Experiment(ExperimentBase):
             mel_pred = outputs['mel_outputs_postnet']
             self.visualizer.add_figure(
                 f"valid_sentence_{i}_predicted_spectrogram",
-                display.plot_spectrogram(mel_pred[0].numpy().T),
-                self.iteration)
+                display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

             # write visual log
             valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -0,0 +1,24 @@
# Chinese Text Frontend Example
Here's an example for Chinese text frontend, including g2p and text normalization.
## G2P
For g2p, we use BZNSYP's phone labels as the ground truth, and we delete silence tokens in both the labels and the predicted phones.

You should download BZNSYP from its [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`.

We use `WER` as the evaluation criterion.
## Text Normalization
For text normalization, the test data is `data/textnorm_test_cases.txt`; we use `|` as the separator between raw_data and normed_data.

We use `CER` as the evaluation criterion.
## Start
If you want to use sclite to get more detailed information about WER, run the command below to build sclite first.
```bash
./make_sclite.sh
```
Run the command below to get the test results.
```bash
./run.sh
```
The `avg WER` of g2p is: 0.027495061517943988

The `avg CER` of text normalization is: 0.006388318503308237
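The WER/CER figures above come from the sclite-based scoring driven by `run.sh`, which is not shown in this diff. As a rough, self-contained illustration of the metric itself (not the repo's scorer), WER between a reference and a predicted phone sequence is just length-normalized edit distance:

```python
# A rough illustration of WER over phone sequences (the repo scores with
# sclite; this sketch only shows the metric's definition).
def edit_distance(ref, hyp):
    """Levenshtein distance between two token sequences (one rolling row)."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, start=1):
            prev, dp[j] = dp[j], min(
                dp[j] + 1,         # deletion
                dp[j - 1] + 1,     # insertion
                prev + (r != h))   # substitution (free if tokens match)
    return dp[-1]


def wer(ref_tokens, hyp_tokens):
    return edit_distance(ref_tokens, hyp_tokens) / len(ref_tokens)


# e.g. wer("ni3 hao3 sil".split(), "ni3 hao3".split()) == 1/3
```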
@@ -0,0 +1,125 @@
|
|||
今天的最低气温达到-10°C.|今天的最低气温达到零下十度.
|
||||
只要有33/4的人同意,就可以通过决议。|只要有四分之三十三的人同意,就可以通过决议。
|
||||
1945年5月2日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。|一九四五年五月二日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。
|
||||
4月16日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。|四月十六日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。
|
||||
如果剩下的30.6%是过去,那么还有69.4%.|如果剩下的百分之三十点六是过去,那么还有百分之六十九点四.
|
||||
事情发生在2020/03/31的上午8:00.|事情发生在二零二零年三月三十一日的上午八点.
|
||||
警方正在找一支.22口径的手枪。|警方正在找一支零点二二口径的手枪。
|
||||
欢迎致电中国联通,北京2022年冬奥会官方合作伙伴为您服务|欢迎致电中国联通,北京二零二二年冬奥会官方合作伙伴为您服务
|
||||
充值缴费请按1,查询话费及余量请按2,跳过本次提醒请按井号键。|充值缴费请按一,查询话费及余量请按二,跳过本次提醒请按井号键。
|
||||
快速解除流量封顶请按星号键,腾讯王卡产品介绍、使用说明、特权及活动请按9,查询话费、套餐余量、积分及活动返款请按1,手机上网流量开通及取消请按2,查询本机号码及本号所使用套餐请按4,密码修改及重置请按5,紧急开机请按6,挂失请按7,查询充值记录请按8,其它自助服务及人工服务请按0|快速解除流量封顶请按星号键,腾讯王卡产品介绍、使用说明、特权及活动请按九,查询话费、套餐余量、积分及活动返款请按一,手机上网流量开通及取消请按二,查询本机号码及本号所使用套餐请按四,密码修改及重置请按五,紧急开机请按六,挂失请按七,查询充值记录请按八,其它自助服务及人工服务请按零
|
||||
智能客服助理快速查话费、查流量请按9,了解北京联通业务请按1,宽带IPTV新装、查询请按2,障碍报修请按3,充值缴费请按4,投诉建议请按5,政企业务请按7,人工服务请按0,for english severice press star key|智能客服助理快速查话费、查流量请按九,了解北京联通业务请按一,宽带IPTV新装、查询请按二,障碍报修请按三,充值缴费请按四,投诉建议请按五,政企业务请按七,人工服务请按零,for english severice press star key
|
||||
您的帐户当前可用余额为63.89元,本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。|您的帐户当前可用余额为六十三点八九元,本月消费为二点一七元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
|
||||
您的帐户当前可用余额为负15.5元,本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。|您的帐户当前可用余额为负十五点五元,本月消费为五十九点六元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
|
||||
尊敬的客户,您目前的话费余额为负14.60元,已低于10元,为保证您的通信畅通,请及时缴纳费用。|尊敬的客户,您目前的话费余额为负十四点六元,已低于十元,为保证您的通信畅通,请及时缴纳费用。
|
||||
您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。|您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。
|
||||
您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您?|您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您?
|
||||
您的账户当前可用余额为负36.00元,本月消费36.00元。|您的账户当前可用余额为负三十六元,本月消费三十六元。
|
||||
请问你是电话13985608526的机主吗?|请问你是电话一三九八五六零八五二六的机主吗?
|
||||
如您对处理结果不满意,可拨打中国联通集团投诉电话10015进行投诉,按本地通话费收费,返回自助服务请按井号键|如您对处理结果不满意,可拨打中国联通集团投诉电话一零零一五进行投诉,按本地通话费收费,返回自助服务请按井号键
|
||||
“26314”号VIP客服代表为您服务。|“二六三一四”号VIP客服代表为您服务。
|
||||
尊敬的5G用户,欢迎您致电中国联通|尊敬的五G用户,欢迎您致电中国联通
|
||||
首先是应用了M1芯片的iPad Pro,新款的iPad Pro支持5G,这也是苹果的第二款5G产品线。|首先是应用了M一芯片的iPad Pro,新款的iPad Pro支持五G,这也是苹果的第二款五G产品线。
|
||||
除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。|除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。
|
||||
屏幕方面,iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏,支持HDR10、杜比视界,还支持杜比全景声。|屏幕方面,iPad Pro 十二点九版本支持XDR体验的Mini-LEDS显示屏,支持HDR十、杜比视界,还支持杜比全景声。
|
||||
iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这次也推出白色版本。
|
||||
售价方面,11英寸版本售价799美元起,12.9英寸售价1099美元起。|售价方面,十一英寸版本售价七百九十九美元起,十二点九英寸售价一千零九十九美元起。
|
||||
这块黄金重达324.75克|这块黄金重达三百二十四点七五克
|
||||
她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日,她弟弟出生于一九九五年三月一日
|
||||
电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九
|
||||
现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票
|
||||
随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万
|
||||
明天有62%的概率降雨|明天有百分之六十二的概率降雨
|
||||
这是固话0421-33441122|这是固话零四二一三三四四一一二二
|
||||
这是手机+86 18544139121|这是手机八六一八五四四一三九一二一
|
||||
小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五cm,梦想是打篮球!我觉得有百分之零点一的可能性。
|
||||
不管三七二十一|不管三七二十一
|
||||
九九八十一难|九九八十一难
|
||||
2018年5月23号上午10点10分|二零一八年五月二十三号上午十点十分
|
||||
10076|一零零七六
|
||||
32.68%|百分之三十二点六八
|
||||
比分测试17:16|比分测试十七比十六
|
||||
比分测试37:16|比分测试三十七比十六
|
||||
1.1|一点一
|
||||
一点一滴|一点一滴
|
||||
八九十|八九十
|
||||
1个人一定要|一个人一定要
|
||||
10000棵树|一万棵树
|
||||
1234个人|一千二百三十四个人
|
||||
35553座楼|三万五千五百五十三座楼
|
||||
15873690|一五八七三六九零
|
||||
27930122|二七九三零一二二
|
||||
85307499|八五三零七四九九
|
||||
26149787|二六一四九七八七
|
||||
15964862|一五九六四八六二
|
||||
45698723|四五六九八七二三
|
||||
48615964|四八六一五九六四
|
||||
17864589|一七八六四五八九
|
||||
123加456|一百二十三加四百五十六
|
||||
9786加3384|九千七百八十六加三千三百八十四
|
||||
发电站每天发电30029度电|发电站每天发电三万零二十九度电
|
||||
银行月交易总额七千九百零三亿元|银行月交易总额七千九百零三亿元
|
||||
深圳每月平均工资在13000元|深圳每月平均工资在一万三千元
|
||||
每月房租要交1500元|每月房租要交一千五百元
|
||||
我每月交通费用在400元左右|我每月交通费用在四百元左右
|
||||
本月开销费用是51328元|本月开销费用是五万一千三百二十八元
|
||||
如果你中了五千万元奖金会分我一半吗|如果你中了五千万元奖金会分我一半吗
|
||||
这个月工资我发了3529元|这个月工资我发了三千五百二十九元
|
||||
学会了这个技能你至少可以涨薪5000元|学会了这个技能你至少可以涨薪五千元
|
||||
我们的会议时间定在9点25分开始|我们的会议时间定在九点二十五分开始
|
||||
上课时间是8点15分请不要迟到|上课时间是八点十五分请不要迟到
|
||||
昨天你9点21分才到教室|昨天你九点二十一分才到教室
|
||||
今天是2019年1月31号|今天是二零一九年一月三十一号
|
||||
今年的除夕夜是2019年2月4号|今年的除夕夜是二零一九年二月四号
|
||||
这根水管的长度不超过35米|这根水管的长度不超过三十五米
|
||||
400米是最短的长跑距离|四百米是最短的长跑距离
|
||||
最高的撑杆跳为11米|最高的撑杆跳为十一米
|
||||
等会请在12:05请通知我|等会请在十二点零五分请通知我
|
||||
23点15分开始|二十三点十五分开始
|
||||
你生日那天我会送你999朵玫瑰|你生日那天我会送你九百九十九朵玫瑰
|
||||
给我1双鞋我可以跳96米远|给我一双鞋我可以跳九十六米远
|
||||
虽然我们的身高相差356毫米也不影响我们交往|虽然我们的身高相差三百五十六毫米也不影响我们交往
|
||||
我们班的最高总分为583分|我们班的最高总分为五百八十三分
|
||||
今天考试老师多扣了我21分|今天考试老师多扣了我二十一分
|
||||
我量过这张桌子总长为1.37米|我量过这张桌子总长为一点三七米
|
||||
乘务员身高必须超过185公分|乘务员身高必须超过一百八十五公分
|
||||
这台电脑分辨率为1024|这台电脑分辨率为一零二四
|
||||
手机价格不超过1500元|手机价格不超过一千五百元
|
||||
101.23|一百零一点二三
|
||||
123.116|一百二十三点一一六
|
||||
456.147|四百五十六点一四七
|
||||
0.1594|零点一五九四
|
||||
3.1415|三点一四一五
|
||||
0.112233|零点一一二二三三
|
||||
0.1|零点一
|
||||
40001.987|四万零一点九八七
|
||||
56.878|五十六点八七八
|
||||
0.00123|零点零零一二三
|
||||
0.0001|零点零零零一
|
||||
0.92015|零点九二零一五
|
||||
999.0001|九百九十九点零零零一
|
||||
10000.123|一万点一二三
|
||||
666.555|六百六十六点五五五
|
||||
444.789|四百四十四点七八九
|
||||
789.666|七百八十九点六六六
|
||||
0.12345|零点一二三四五
|
||||
1.05649|一点零五六四九
|
||||
环比上调1.86%|环比上调百分之一点八六
|
||||
环比分别下跌3.46%及微涨0.70%|环比分别下跌百分之三点四六及微涨百分之零点七
|
||||
单价在30000元的二手房购房个案当中|单价在三万元的二手房购房个案当中
|
||||
6月仍有7%单价在30000元的房源|六月仍有百分之七单价在三万元的房源
|
||||
最终也只是以总积分1分之差屈居第2|最终也只是以总积分一分之差屈居第二
|
||||
中新网8月29日电今日|中新网八月二十九日电今日
|
||||
自6月底呼和浩特市率先宣布取消限购后|自六月底呼和浩特市率先宣布取消限购后
|
||||
仅1个多月的时间里|仅一个多月的时间里
|
||||
除了北京上海广州深圳4个一线城市和三亚之外|除了北京上海广州深圳四个一线城市和三亚之外
|
||||
46个限购城市当中|四十六个限购城市当中
|
||||
41个已正式取消或变相放松了限购|四十一个已正式取消或变相放松了限购
|
||||
其中包括对拥有一套住房并已结清相应购房贷款的家庭|其中包括对拥有一套住房并已结清相应购房贷款的家庭
|
||||
这个后来被称为930新政策的措施|这个后来被称为九三零新政策的措施
|
||||
今年有望超三百亿美元|今年有望超三百亿美元
|
||||
就连一向看多的任志强|就连一向看多的任志强
|
||||
近期也一反常态地发表看空言论|近期也一反常态地发表看空言论
|
||||
985|九八五
|
||||
12~23|十二到二十三
|
||||
12-23|十二到二十三
|
|
@ -0,0 +1,90 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from collections import defaultdict
from pathlib import Path

from praatio import tgio


def get_baker_data(root_dir):
    alignment_files = sorted(
        list((root_dir / "PhoneLabeling").rglob("*.interval")))
    text_file = (root_dir / "ProsodyLabeling/000001-010000.txt").expanduser()
    # filter out several files that have errors in annotation
    exclude = {'000611', '000662', '002365', '005107'}
    alignment_files = [f for f in alignment_files if f.stem not in exclude]
    data_dict = defaultdict(dict)
    for alignment_fp in alignment_files:
        alignment = tgio.openTextgrid(alignment_fp)
        # use only the first tier (baker's own annotation); the tier name
        # carries the utterance id
        utt_id = alignment.tierNameList[0].split(".")[0]
        intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
        phones = [interval.label for interval in intervals]
        data_dict[utt_id]["phones"] = phones
    with open(text_file, "r") as f:
        for line in f:
            if line.startswith("0"):
                utt_id, raw_text = line.strip().split()
                if utt_id in data_dict:
                    data_dict[utt_id]['text'] = raw_text
            else:
                # a pinyin line always follows its text line, so utt_id is
                # already set here
                pinyin = line.strip().split()
                if utt_id in data_dict:
                    data_dict[utt_id]['pinyin'] = pinyin
    return data_dict


def get_g2p_phones(data_dict, frontend):
    for utt_id in data_dict:
        g2p_phones = frontend.get_phonemes(data_dict[utt_id]['text'])
        data_dict[utt_id]["g2p_phones"] = g2p_phones
    return data_dict


def main():
    parser = argparse.ArgumentParser(description="g2p example.")
    parser.add_argument(
        "--root-dir",
        default=None,
        type=str,
        help="root directory of the baker dataset.")
    parser.add_argument(
        "--output-dir",
        default="data/g2p",
        type=str,
        help="output directory.")

    args = parser.parse_args()
    root_dir = Path(args.root_dir).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    assert root_dir.is_dir()
    data_dict = get_baker_data(root_dir)
    raw_path = output_dir / "text"
    ref_path = output_dir / "text.ref"
    with open(raw_path, "w") as wf_raw, open(ref_path, "w") as wf_ref:
        for utt_id in data_dict:
            wf_raw.write(utt_id + " " + data_dict[utt_id]['text'] + "\n")
            wf_ref.write(utt_id + " " + " ".join(data_dict[utt_id]['phones']) +
                         "\n")


if __name__ == "__main__":
    main()
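The hunk above is evidently `get_g2p_data.py` (it matches the invocation in `run.sh` further down). For orientation, `get_baker_data` gathers three parallel views of each Baker/BZNSYP utterance; a sketch of the resulting `data_dict` shape, with made-up values:

```python
# Illustrative only: the shape of data_dict returned by get_baker_data.
data_dict = {
    "000001": {
        # labels of the first tier of PhoneLabeling/000001.interval
        "phones": ["sil", "k", "a2", "er2", "sp1"],
        # the raw text taken from the line in 000001-010000.txt that
        # starts with the utterance id (prosody marks #1..#4 included)
        "text": "卡尔普#2陪外孙#1玩滑梯#4。",
        # the pinyin tokens from the following line
        "pinyin": ["ka2", "er2", "pu3"],
    },
}
```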
@@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="text normalization example.")
    parser.add_argument(
        "--test-file",
        default="data/textnorm_test_cases.txt",
        type=str,
        help="path to the text normalization test file.")
    parser.add_argument(
        "--output-dir",
        default="data/textnorm",
        type=str,
        help="output directory.")

    args = parser.parse_args()
    test_file = Path(args.test_file).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    raw_path = output_dir / "text"
    ref_path = output_dir / "text.ref"

    with open(test_file, "r") as rf, \
            open(raw_path, "w") as wf_raw, \
            open(ref_path, "w") as wf_ref:
        for i, line in enumerate(rf):
            raw_text, normed_text = line.strip().split("|")
            wf_raw.write("utt_" + str(i) + " " + raw_text + "\n")
            wf_ref.write("utt_" + str(i) + " " + normed_text + "\n")


if __name__ == "__main__":
    main()
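This hunk is `get_textnorm_data.py` (again matching `run.sh` below). It splits each test case on `|` and writes the two sides to parallel files keyed by a synthesized `utt_<line index>` id. A small sketch of peeking at the result, assuming the default output directory:

```python
# Print the first line of each file written above (default --output-dir).
for name in ("text", "text.ref"):
    with open("data/textnorm/" + name, "r", encoding="utf-8") as f:
        print(name, "->", f.readline().rstrip())
# Given the first test case, this should show:
#   text -> utt_0 她出生于86年8月18日,她弟弟出生于1995年3月1日
#   text.ref -> utt_0 她出生于八六年八月十八日,她弟弟出生于一九九五年三月一日
```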
@@ -0,0 +1,13 @@
#!/bin/bash

if [ ! -d "./SCTK" ]; then
    echo "Cloning SCTK ..."
    git clone https://github.com/usnistgov/SCTK
    echo "Cloning SCTK done!"
fi

if [ ! -d "./SCTK/bin" ]; then
    echo "Building SCTK ..."
    pushd SCTK && make config && make all && make check && make install && make doc && popd
    echo "SCTK build done!"
fi
@@ -0,0 +1,25 @@
#!/bin/bash

USE_SCLITE=true

# test g2p
echo "Start getting g2p test data ..."
python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
echo "Start testing g2p ..."
python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p

# test text normalization
echo "Start getting text normalization test data ..."
python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm
echo "Start testing text normalization ..."
python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm

# whether to use sclite for a more detailed breakdown of the WER
if [ "$USE_SCLITE" = true ]; then
    echo "Start sclite g2p ..."
    ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
    echo

    echo "Start sclite textnorm ..."
    ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all
fi
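`run.sh` drives the whole pipeline and optionally scores with NIST's sclite (built by the SCTK script above). sclite is given the files in its `trn` transcript format, where each line is a token sequence followed by a parenthesized utterance id; this is why `test_g2p.py` below suffixes every written line with `(baker_<utt_id>)`. A minimal sketch of emitting one such line (the tokens here are made up):

```python
# Minimal sketch of sclite's "trn" line format: tokens, then "(<id>)".
# Mirrors the writes in test_g2p.py below.
def to_trn(tokens, utt_id):
    return " ".join(tokens) + "(baker_" + utt_id + ")"

print(to_trn(["k", "a2", "er2", "pu3"], "000001"))
# -> k a2 er2 pu3(baker_000001)
```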
@@ -0,0 +1,99 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import re
from pathlib import Path

from parakeet.frontend.cn_frontend import Frontend as cnFrontend
from parakeet.utils.error_rate import word_errors

SILENCE_TOKENS = {"sp", "sil", "sp1", "spl"}


def text_cleaner(raw_text):
    # strip prosody marks (#1..#4) and quotation/parenthesis characters
    text = re.sub('#[1-4]|“|”|(|)', '', raw_text)
    text = text.replace("…。", "。")
    # map remaining pause-like punctuation to a plain comma
    text = re.sub(':|;|——|……|、|…|—', ',', text)
    return text


def get_avg_wer(raw_dict, ref_dict, frontend, output_dir):
    edit_distances = []
    ref_lens = []
    with open(output_dir / "text.g2p", "w") as wf_g2p, \
            open(output_dir / "text.ref.clean", "w") as wf_ref:
        for utt_id in raw_dict:
            if utt_id not in ref_dict:
                continue
            raw_text = raw_dict[utt_id]
            text = text_cleaner(raw_text)
            g2p_phones = frontend.get_phonemes(text)
            g2p_phones = sum(g2p_phones, [])
            gt_phones = ref_dict[utt_id].split(" ")
            # drop silence tokens from both predicted and ground-truth phones
            g2p_phones = [phn for phn in g2p_phones if phn not in SILENCE_TOKENS]
            gt_phones = [phn for phn in gt_phones if phn not in SILENCE_TOKENS]
            gt_phones = " ".join(gt_phones)
            g2p_phones = " ".join(g2p_phones)
            wf_ref.write(gt_phones + "(baker_" + utt_id + ")" + "\n")
            wf_g2p.write(g2p_phones + "(baker_" + utt_id + ")" + "\n")
            edit_distance, ref_len = word_errors(gt_phones, g2p_phones)
            edit_distances.append(edit_distance)
            ref_lens.append(ref_len)

    return sum(edit_distances) / sum(ref_lens)


def main():
    parser = argparse.ArgumentParser(description="g2p example.")
    parser.add_argument(
        "--input-dir",
        default="data/g2p",
        type=str,
        help="directory of the preprocessed test data.")
    parser.add_argument(
        "--output-dir",
        default="exp/g2p",
        type=str,
        help="directory to save g2p results.")

    args = parser.parse_args()
    input_dir = Path(args.input_dir).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    assert input_dir.is_dir()
    raw_dict, ref_dict = dict(), dict()
    raw_path = input_dir / "text"
    ref_path = input_dir / "text.ref"

    with open(raw_path, "r") as rf:
        for line in rf:
            line_list = line.strip().split(" ")
            utt_id, raw_text = line_list[0], " ".join(line_list[1:])
            raw_dict[utt_id] = raw_text
    with open(ref_path, "r") as rf:
        for line in rf:
            line_list = line.strip().split(" ")
            utt_id, phones = line_list[0], " ".join(line_list[1:])
            ref_dict[utt_id] = phones
    frontend = cnFrontend()
    avg_wer = get_avg_wer(raw_dict, ref_dict, frontend, output_dir)
    print("The average WER of g2p is:", avg_wer)


if __name__ == "__main__":
    main()
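`test_g2p.py` reports a corpus-level phone error rate: `word_errors` (from `parakeet.utils.error_rate`) presumably returns the token-level edit distance and the reference length for each pair, and the final figure is `sum(edit_distances) / sum(ref_lens)` rather than a mean of per-utterance WERs, so longer utterances weigh more. A self-contained sketch of that aggregation, with a plain Levenshtein distance standing in for the library call:

```python
# Sketch of the aggregation in get_avg_wer, assuming word_errors returns
# (edit_distance, reference_length) for two token sequences.
def levenshtein(ref, hyp):
    # classic dynamic-programming edit distance over token lists
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,              # deletion
                           cur[j - 1] + 1,           # insertion
                           prev[j - 1] + (r != h)))  # substitution
        prev = cur
    return prev[-1]

# Two made-up utterances, one substitution error each.
pairs = [(["b", "a1"], ["p", "a1"]), (["k", "a2", "er2"], ["k", "a3", "er2"])]
dists = [levenshtein(ref, hyp) for ref, hyp in pairs]
lens = [len(ref) for ref, _ in pairs]
print(sum(dists) / sum(lens))  # 2 / 5 = 0.4, the corpus-level WER
```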