Merge pull request #129 from iclementine/speedyspeech
add speedyspeech model and example with baker dataset.
commit 25788ab2ca
@@ -6,18 +6,18 @@
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 sr: 24000                 # Sampling rate.
-n_fft: 2048               # FFT size.
+n_fft: 2048               # FFT size (in samples).
-hop_length: 300           # Hop size.
+hop_length: 300           # Hop size (in samples).
-win_length: 1200          # Window length.
+win_length: 1200          # Window length (in samples).
 # If set to null, it will be the same as fft_size.
 window: "hann"            # Window function.
 n_mels: 80                # Number of mel basis.
-fmin: 80                  # Minimum freq in mel basis calculation.
+fmin: 80                  # Minimum freq in mel basis calculation. (Hz)
-fmax: 7600                # Maximum frequency in mel basis calculation.
+fmax: 7600                # Maximum frequency in mel basis calculation. (Hz)
 # global_gain_scale: 1.0  # Will be multiplied to all of waveform.
 trim_silence: false       # Whether to trim the start and end of silence.
 top_db: 60                # Need to tune carefully if the recording is not good.
 trim_frame_length: 2048   # Frame size in trimming. (in samples)
 trim_hop_length: 512      # Hop size in trimming. (in samples)

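For reference, the frame geometry implied by these feature-extraction settings works out as follows (a quick arithmetic check, not part of the diff):

sr = 24000          # samples per second
hop_length = 300    # samples between frames
win_length = 1200   # samples per analysis window

frame_shift_ms = hop_length / sr * 1000   # 12.5 ms between frames
window_ms = win_length / sr * 1000        # 50 ms analysis window
frames_per_second = sr / hop_length       # 80 spectrogram frames per second
print(frame_shift_ms, window_ms, frames_per_second)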
@@ -202,14 +202,12 @@ def process_sentences(config,
 def main():
     # parse config and args
     parser = argparse.ArgumentParser(
-        description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
-    )
+        description="Preprocess audio and then extract features.")
     parser.add_argument(
         "--rootdir",
         default=None,
         type=str,
-        help="directory including wav files. you need to specify either scp or rootdir."
-    )
+        help="directory to baker dataset.")
     parser.add_argument(
         "--dumpdir",
         type=str,
@@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --output=dump/train/stats.npy

python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
@@ -0,0 +1,9 @@
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=1
@@ -0,0 +1,43 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from parakeet.data.batch import batch_sequences


def collate_baker_examples(examples):
    # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
    phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
    durations = [
        np.array(item["durations"], dtype=np.int64) for item in examples
    ]
    num_phones = np.array([item["num_phones"] for item in examples])
    num_frames = np.array([item["num_frames"] for item in examples])

    phones = batch_sequences(phones)
    tones = batch_sequences(tones)
    feats = batch_sequences(feats)
    durations = batch_sequences(durations)
    batch = {
        "phones": phones,
        "tones": tones,
        "num_phones": num_phones,
        "num_frames": num_frames,
        "feats": feats,
        "durations": durations,
    }
    return batch
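In train.py this function is passed as the collate_fn of a paddle.io.DataLoader. A minimal sketch of what it produces, calling it directly on two toy examples (the field values and shapes below are invented for illustration, and collate_baker_examples from the file above is assumed importable):

import numpy as np

# Two toy examples carrying the fields this collate function expects.
examples = [
    {"phones": [3, 7, 2], "tones": [0, 1, 4], "num_phones": 3,
     "num_frames": 5, "feats": np.zeros((5, 80), "float32"), "durations": [1, 2, 2]},
    {"phones": [3, 9], "tones": [0, 2], "num_phones": 2,
     "num_frames": 4, "feats": np.zeros((4, 80), "float32"), "durations": [2, 2]},
]

batch = collate_baker_examples(examples)
# Variable-length fields are padded to the longest item in the batch,
# e.g. batch["phones"] ends up (2, 3) and batch["feats"] (2, 5, 80).
print(batch["phones"].shape, batch["feats"].shape, batch["num_frames"])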
@@ -0,0 +1,109 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import yaml
import json
import jsonlines

from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features.")
    parser.add_argument(
        "--metadata", type=str, help="json file with id and file paths")
    parser.add_argument(
        "--field-name",
        type=str,
        help="name of the field to compute statistics for.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--output",
        type=str,
        help="path to save statistics. if not provided, "
        "stats will be saved in the above root directory with name stats.npy")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    config = get_cfg_default()
    # load config
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    if args.output is None:
        args.output = Path(args.metadata).parent.with_name("stats.npy")
    else:
        args.output = Path(args.output)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata,
        fields=[args.field_name],
        converters={args.field_name: np.load}, )
    logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics
    scaler = StandardScaler()
    for datum in tqdm(dataset):
        # StandardScaler supports (*, num_features) by default
        scaler.partial_fit(datum[args.field_name])

    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
    np.save(str(args.output), stats.astype(np.float32), allow_pickle=False)


if __name__ == "__main__":
    main()
@@ -0,0 +1,60 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sr: 24000                 # Sampling rate.
n_fft: 2048               # FFT size.
hop_length: 300           # Hop size.
win_length: 1200          # Window length.
# If set to null, it will be the same as fft_size.
window: "hann"            # Window function.
n_mels: 80                # Number of mel basis.
fmin: 80                  # Minimum freq in mel basis calculation.
fmax: 7600                # Maximum frequency in mel basis calculation.
# global_gain_scale: 1.0  # Will be multiplied to all of waveform.
trim_silence: false       # Whether to trim the start and end of silence.
top_db: 60                # Need to tune carefully if the recording is not good.
trim_frame_length: 2048   # Frame size in trimming. (in samples)
trim_hop_length: 512      # Hop size in trimming. (in samples)


###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 32
num_workers: 4


###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
  vocab_size: 101  # 99 + 2
  tone_size: 8     # 6 + 2
  encoder_hidden_size: 128
  encoder_kernel_size: 3
  encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
  duration_predictor_hidden_size: 128
  decoder_hidden_size: 128
  decoder_output_size: 80
  decoder_kernel_size: 3
  decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]


###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################


###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 300
num_snapshots: 5


###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
@@ -0,0 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
from yacs.config import CfgNode as Configuration

with open("conf/default.yaml", 'rt') as f:
    _C = yaml.safe_load(f)
_C = Configuration(_C)


def get_cfg_default():
    config = _C.clone()
    return config
@@ -0,0 +1,92 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
phkit.initialize()
from parakeet.frontend.vocab import Vocab

with open("phones.txt", 'rt') as f:
    phones = [line.strip() for line in f.readlines()]

with open("tones.txt", 'rt') as f:
    tones = [line.strip() for line in f.readlines()]
voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)


def segment(sentence):
    segments = re.split(r'[:,;。?!]', sentence)
    segments = [seg for seg in segments if len(seg)]
    return segments


def g2p(sentence):
    segments = segment(sentence)
    phones = []
    phones.append('sil')
    tones = []
    tones.append('0')

    for seg in segments:
        seg = jieba.lcut(seg)
        initials = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for c, v in zip(initials, finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if re.match(r'i\d', v):
                if c in ['z', 'c', 's']:
                    v = re.sub('i', 'ii', v)
                elif c in ['zh', 'ch', 'sh', 'r']:
                    v = re.sub('i', 'iii', v)
            if c:
                phones.append(c)
                tones.append('0')
            if v:
                phones.append(v[:-1])
                tones.append(v[-1])
        phones.append('sp')
        tones.append('0')
    phones[-1] = 'sil'
    tones[-1] = '0'
    return (phones, tones)


def p2id(voc, phonemes):
    phone_ids = [voc.lookup(item) for item in phonemes]
    return np.array(phone_ids, np.int64)


def t2id(voc, tones):
    tone_ids = [voc.lookup(item) for item in tones]
    return np.array(tone_ids, np.int64)


def text_analysis(sentence):
    phonemes, tones = g2p(sentence)
    print(sentence)
    print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])
    phone_ids = p2id(voc_phones, phonemes)
    tone_ids = t2id(voc_tones, tones)
    phones = paddle.to_tensor(phone_ids)
    tones = paddle.to_tensor(tone_ids)
    return phones, tones
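To make the i/ii/iii post-processing in g2p concrete, here is a small walk-through of that branch applied to single (initial, final-with-tone) pairs; the inputs are chosen for illustration and the helper name is made up, but the logic mirrors the code above:

import re

def split_final(c, v):
    # Same branch as in g2p: distinguish the apical vowels written "ii"/"iii"
    # from the plain "i" final, based on the initial consonant.
    if re.match(r'i\d', v):
        if c in ['z', 'c', 's']:
            v = re.sub('i', 'ii', v)       # e.g. initial "s",  final "i4" -> "ii"
        elif c in ['zh', 'ch', 'sh', 'r']:
            v = re.sub('i', 'iii', v)      # e.g. initial "zh", final "i1" -> "iii"
    return c, v[:-1], v[-1]

print(split_final('s', 'i4'))    # ('s', 'ii', '4')
print(split_final('zh', 'i1'))   # ('zh', 'iii', '1')
print(split_final('b', 'a3'))    # ('b', 'a', '3')  -- unchanged, not an "i" final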
@@ -0,0 +1,150 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""

import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path

import numpy as np
import yaml
import jsonlines
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable

from config import get_cfg_default


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="directory including feature files to be normalized. "
        "you need to specify either *-scp or rootdir.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--stats", type=str, required=True, help="statistics file.")
    parser.add_argument(
        "--phones",
        type=str,
        default="phones.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--tones", type=str, default="tones.txt", help="tone vocabulary file.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    # check directory existence
    dumpdir = Path(args.dumpdir).resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(metadata, converters={'feats': np.load, })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    scaler.mean_ = np.load(args.stats)[0]
    scaler.scale_ = np.load(args.stats)[1]

    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    with open(args.phones, 'rt') as f:
        phones = [line.strip() for line in f.readlines()]

    with open(args.tones, 'rt') as f:
        tones = [line.strip() for line in f.readlines()]
    voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
    voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        mel = item['feats']
        # normalize
        mel = scaler.transform(mel)

        # save
        mel_path = dumpdir / f"{utt_id}-feats.npy"
        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
        phone_ids = [voc_phones.lookup(p) for p in item['phones']]
        tone_ids = [voc_tones.lookup(t) for t in item['tones']]
        output_metadata.append({
            'utt_id': utt_id,
            'phones': phone_ids,
            'tones': tone_ids,
            'num_phones': item['num_phones'],
            'num_frames': item['num_frames'],
            'durations': item['durations'],
            'feats': str(mel_path),
        })
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
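For orientation, a record in the resulting dump/train/norm/metadata.jsonl looks roughly like the dict below; the field names come from the loop above, while the concrete ids, sizes, and path are invented for illustration:

# Hypothetical example of one normalized metadata record (values are made up).
example_record = {
    "utt_id": "000001",
    "phones": [97, 26, 82],      # integer ids from phones.txt via Vocab.lookup
    "tones": [0, 1, 4],          # integer ids from tones.txt via Vocab.lookup
    "num_phones": 3,
    "num_frames": 210,
    "durations": [20, 95, 95],   # per-phone durations in frames; they sum to num_frames
    "feats": "dump/train/norm/000001-feats.npy",
}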
@@ -0,0 +1,99 @@
b
p
m
f
d
t
n
l
g
k
h
zh
ch
sh
r
z
c
s
j
q
x
a
ar
ai
air
ao
aor
an
anr
ang
angr
e
er
ei
eir
en
enr
eng
engr
o
or
ou
our
ong
ongr
ii
iir
iii
iiir
i
ir
ia
iar
iao
iaor
ian
ianr
iang
iangr
ie
ier
io
ior
iou
iour
iong
iongr
in
inr
ing
ingr
u
ur
ua
uar
uai
uair
uan
uanr
uang
uangr
uei
ueir
uo
uor
uen
uenr
ueng
uengr
v
vr
ve
ver
van
vanr
vn
vnr
sil
sp
@@ -0,0 +1,309 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import re
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
import logging

from config import get_cfg_default
from tg_utils import validate_textgrid


def logmelfilterbank(audio,
                     sr,
                     n_fft=1024,
                     hop_length=256,
                     win_length=None,
                     window="hann",
                     n_mels=80,
                     fmin=None,
                     fmax=None,
                     eps=1e-10):
    """Compute log-Mel filterbank feature.

    Parameters
    ----------
    audio : ndarray
        Audio signal (T,).
    sr : int
        Sampling rate.
    n_fft : int
        FFT size. (Default value = 1024)
    hop_length : int
        Hop size. (Default value = 256)
    win_length : int
        Window length. If set to None, it will be the same as fft_size. (Default value = None)
    window : str
        Window function type. (Default value = "hann")
    n_mels : int
        Number of mel basis. (Default value = 80)
    fmin : int
        Minimum frequency in mel basis calculation. (Default value = None)
    fmax : int
        Maximum frequency in mel basis calculation. (Default value = None)
    eps : float
        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)

    Returns
    -------
    np.ndarray
        Log Mel filterbank feature (#frames, num_mels).

    """
    # get amplitude spectrogram
    x_stft = librosa.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        pad_mode="reflect")
    spc = np.abs(x_stft)  # (#bins, #frames,)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sr / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)

    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     alignment_fp: Path,
                     output_dir: Path):
    utt_id = fp.stem

    # reading
    y, sr = librosa.load(fp, sr=config.sr)  # resampling may occur
    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(y).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
    duration = librosa.get_duration(y, sr=sr)

    # intervals with empty labels are ignored
    alignment = tgio.openTextgrid(alignment_fp)

    # validate text grid against audio file
    num_samples = y.shape[0]
    validate_textgrid(alignment, num_samples, sr)

    # only with baker's annotation
    intervals = alignment.tierDict[alignment.tierNameList[0]].entryList

    first, last = intervals[0], intervals[-1]
    if not (first.label == "sil" and first.end < duration):
        logging.warning(
            f"There is something wrong with the first interval {first} in utterance: {utt_id}"
        )
    if not (last.label == "sil" and last.start < duration):
        logging.warning(
            f"There is something wrong with the last interval {last} in utterance: {utt_id}"
        )

    logmel = logmelfilterbank(
        y,
        sr=sr,
        n_fft=config.n_fft,
        window=config.window,
        win_length=config.win_length,
        hop_length=config.hop_length,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    # extract phone and duration
    phones = []
    tones = []
    ends = []
    durations_sec = []

    for interval in intervals:
        label = interval.label
        label = label.replace("sp1", "sp")  # Baker has sp1 rather than sp

        # split tone from finals
        match = re.match(r'^(\w+)([012345])$', label)
        if match:
            phones.append(match.group(1))
            tones.append(match.group(2))
        else:
            phones.append(label)
            tones.append('0')
        end = min(duration, interval.end)
        ends.append(end)
        durations_sec.append(end - interval.start)  # duration in seconds

    frame_pos = librosa.time_to_frames(
        ends, sr=sr, hop_length=config.hop_length)
    durations_frame = np.diff(frame_pos, prepend=0)

    num_frames = logmel.shape[-1]  # number of frames of the spectrogram
    extra = np.sum(durations_frame) - num_frames
    assert extra <= 0, (
        f"Number of frames inferred from alignment is "
        f"larger than number of frames of the spectrogram by {extra} frames")
    durations_frame[-1] += (-extra)

    assert np.sum(durations_frame) == num_frames
    durations_frame = durations_frame.tolist()

    mel_path = output_dir / (utt_id + "_feats.npy")
    np.save(mel_path, logmel.T)  # (num_frames, n_mels)
    record = {
        "utt_id": utt_id,
        "phones": phones,
        "tones": tones,
        "num_phones": len(phones),
        "num_frames": num_frames,
        "durations": durations_frame,
        "feats": str(mel_path.resolve()),  # use absolute path
    }
    return record


def process_sentences(config,
                      fps: List[Path],
                      alignment_fps: List[Path],
                      output_dir: Path,
                      nprocs: int=1):
    if nprocs == 1:
        results = []
        for fp, alignment_fp in tqdm.tqdm(
                zip(fps, alignment_fps), total=len(fps)):
            results.append(
                process_sentence(config, fp, alignment_fp, output_dir))
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp, alignment_fp in zip(fps, alignment_fps):
                    future = pool.submit(process_sentence, config, fp,
                                         alignment_fp, output_dir)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    results.append(ft.result())

    results.sort(key=itemgetter("utt_id"))
    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
        for item in results:
            writer.write(item)
    print("Done")


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--config", type=str, help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num_cpu", type=int, default=1, help="number of process.")
    args = parser.parse_args()

    C = get_cfg_default()
    if args.config:
        C.merge_from_file(args.config)
    C.freeze()

    if args.verbose > 1:
        print(vars(args))
        print(C)

    root_dir = Path(args.rootdir).expanduser()
    dumpdir = Path(args.dumpdir).expanduser()
    dumpdir.mkdir(parents=True, exist_ok=True)

    wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
    alignment_files = sorted(
        list((root_dir / "PhoneLabeling").rglob("*.interval")))

    # filter out several files that have errors in annotation
    exclude = {'000611', '000662', '002365', '005107'}
    wav_files = [f for f in wav_files if f.stem not in exclude]
    alignment_files = [f for f in alignment_files if f.stem not in exclude]

    # split data into 3 sections
    num_train = 9800
    num_dev = 100

    train_wav_files = wav_files[:num_train]
    dev_wav_files = wav_files[num_train:num_train + num_dev]
    test_wav_files = wav_files[num_train + num_dev:]

    train_alignment_files = alignment_files[:num_train]
    dev_alignment_files = alignment_files[num_train:num_train + num_dev]
    test_alignment_files = alignment_files[num_train + num_dev:]

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    # process for the 3 sections
    process_sentences(
        C,
        train_wav_files,
        train_alignment_files,
        train_dump_dir,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        dev_wav_files,
        dev_alignment_files,
        dev_dump_dir,
        nprocs=args.num_cpu)
    process_sentences(
        C,
        test_wav_files,
        test_alignment_files,
        test_dump_dir,
        nprocs=args.num_cpu)


if __name__ == "__main__":
    main()
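As a quick illustration of the duration bookkeeping above (hop_length=300 at sr=24000 gives 80 frames per second; the interval end times below are invented for the example):

import numpy as np
import librosa

sr, hop_length = 24000, 300
# Hypothetical phone end times in seconds taken from a TextGrid tier.
ends = [0.30, 0.40, 0.62, 0.80]

frame_pos = librosa.time_to_frames(ends, sr=sr, hop_length=hop_length)
durations_frame = np.diff(frame_pos, prepend=0)
print(frame_pos)         # [24 32 49 64]
print(durations_frame)   # [24  8 17 15] -> per-phone durations in frames
# If the spectrogram has a few more frames than sum(durations_frame),
# process_sentence adds the remainder to the last phone so the totals match.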
@@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --output=dump/train/stats.npy

python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
@@ -0,0 +1,6 @@
python train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=1
@@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss

from parakeet.modules.ssim import ssim
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech


class SpeedySpeechUpdater(StandardUpdater):
    def update_core(self, batch):
        decoded, predicted_durations = self.model(
            text=batch["phones"],
            tones=batch["tones"],
            plens=batch["num_phones"],
            durations=batch["durations"])

        target_mel = batch["feats"]
        spec_mask = F.sequence_mask(
            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
        text_mask = F.sequence_mask(
            batch["num_phones"], dtype=predicted_durations.dtype)

        # spec loss
        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)

        # duration loss
        target_durations = batch["durations"]
        target_durations = paddle.maximum(
            target_durations.astype(predicted_durations.dtype),
            paddle.to_tensor([1.0]))
        duration_loss = weighted_mean(
            huber_loss(
                predicted_durations, paddle.log(target_durations), delta=1.0),
            text_mask, )

        # ssim loss
        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
                               (target_mel * spec_mask).unsqueeze(1))

        loss = l1_loss + ssim_loss + duration_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/l1_loss", float(l1_loss))
        report("train/duration_loss", float(duration_loss))
        report("train/ssim_loss", float(ssim_loss))


class SpeedySpeechEvaluator(StandardEvaluator):
    def evaluate_core(self, batch):
        print("fire")
        decoded, predicted_durations = self.model(
            text=batch["phones"],
            tones=batch["tones"],
            plens=batch["num_phones"],
            durations=batch["durations"])

        target_mel = batch["feats"]
        spec_mask = F.sequence_mask(
            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
        text_mask = F.sequence_mask(
            batch["num_phones"], dtype=predicted_durations.dtype)

        # spec loss
        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)

        # duration loss
        target_durations = batch["durations"]
        target_durations = paddle.maximum(
            target_durations.astype(predicted_durations.dtype),
            paddle.to_tensor([1.0]))
        duration_loss = weighted_mean(
            huber_loss(
                predicted_durations, paddle.log(target_durations), delta=1.0),
            text_mask, )

        # ssim loss
        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
                               (target_mel * spec_mask).unsqueeze(1))

        loss = l1_loss + ssim_loss + duration_loss

        # import pdb; pdb.set_trace()

        report("eval/loss", float(loss))
        report("eval/l1_loss", float(l1_loss))
        report("eval/duration_loss", float(duration_loss))
        report("eval/ssim_loss", float(ssim_loss))
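A short note on the duration loss above: the target durations are clamped to at least one frame before taking the log, so phones that received zero frames from the alignment do not produce log(0); the duration predictor is then trained against log-durations with a Huber (smooth L1) penalty. A minimal numeric sketch of that target transform and the standard Huber formula with delta=1 (values invented, and this numpy version is only an illustration, not paddle's implementation):

import numpy as np

predicted_log_durations = np.array([3.0, 2.2, 0.1])   # model output, log-frames
target_frames = np.array([20.0, 9.0, 0.0])            # raw durations from alignment

target = np.log(np.maximum(target_frames, 1.0))       # approx. [3.00, 2.20, 0.00]
err = predicted_log_durations - target
# Standard Huber with delta=1: quadratic for small errors, linear for large ones.
huber = np.where(np.abs(err) <= 1.0, 0.5 * err**2, np.abs(err) - 0.5)
print(huber.mean())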
@@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore


def evaluate(args, speedyspeech_config, pwg_config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
        test_metadata = list(reader)
    test_dataset = DataTable(
        data=test_metadata, fields=["utt_id", "phones", "tones"])

    model = SpeedySpeech(**speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for datum in test_dataset:
        utt_id = datum["utt_id"]
        phones = paddle.to_tensor(datum["phones"])
        tones = paddle.to_tensor(datum["tones"])

        with paddle.no_grad():
            wav = pwg_inference(speedyspeech_inference(phones, tones))
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.sr)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config",
        type=str,
        help="config file to overwrite default config")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config",
        type=str,
        help="config file for the parallel wavegan vocoder.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args = parser.parse_args()
    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
python synthesize.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-stat=dump/train/stats.npy \
    --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
    --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
    --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/debug/test \
    --device="gpu"
@@ -0,0 +1,150 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore

from frontend import text_analysis


def evaluate(args, speedyspeech_config, pwg_config):
    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    model = SpeedySpeech(**speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_params))
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    pwg_inference = PWGInference(pwg_normalizer, vocoder)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        phones, tones = text_analysis(sentence)

        with paddle.no_grad():
            wav = pwg_inference(speedyspeech_inference(phones, tones))
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.sr)
        print(f"{utt_id} done!")


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config",
        type=str,
        help="config file to overwrite default config")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config",
        type=str,
        help="config file for the parallel wavegan vocoder.")
    parser.add_argument(
        "--pwg-params",
        type=str,
        help="parallel wavegan generator parameters to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args = parser.parse_args()
    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
python synthesize_e2e.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
    --speedyspeech-stat=dump/train/stats.npy \
    --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
    --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
    --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
    --text=sentences.txt \
    --output-dir=exp/e2e \
    --device="gpu"
@@ -0,0 +1,27 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
from praatio import tgio


def validate_textgrid(text_grid, num_samples, sr):
    """Validate TextGrid to make sure that the time interval annotated
    by the text grid file does not go beyond the audio file.
    """
    start = text_grid.minTimestamp
    end = text_grid.maxTimestamp

    end_audio = librosa.samples_to_time(num_samples, sr)
    return start == 0.0 and end <= end_audio
@@ -0,0 +1,6 @@
0
1
2
3
4
5
@@ -0,0 +1,186 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path

import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam  # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter

from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech

from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything

from batch_fn import collate_baker_examples
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
from config import get_cfg_default


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if not paddle.is_compiled_with_cuda():
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        world_size = paddle.distributed.get_world_size()
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # the dataloader is too verbose by default
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={"feats": np.load, }, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=[
            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
        ],
        converters={"feats": np.load, }, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=True)
    # dev_sampler = DistributedBatchSampler(dev_dataset,
    #                                       batch_size=config.batch_size,
    #                                       shuffle=False,
    #                                       drop_last=False)
    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)
    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_baker_examples,
        num_workers=config.num_workers)
    print("dataloaders done!")

    # batch = collate_baker_examples([train_dataset[i] for i in range(10)])
    # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
    # import pdb; pdb.set_trace()
    model = SpeedySpeech(**config["model"])
    if world_size > 1:
        model = DataParallel(model)  # TODO, do not use vocab size from config
    # print(model)
    print("model done!")
    optimizer = Adam(
        0.001,
        parameters=model.parameters(),
        grad_clip=nn.ClipGradByGlobalNorm(5.0))
    print("optimizer done!")

    updater = SpeedySpeechUpdater(
        model=model, optimizer=optimizer, dataloader=train_dataloader)

    output_dir = Path(args.output_dir)
    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = SpeedySpeechEvaluator(model, dev_dataloader)

    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        writer = LogWriter(str(output_dir))
        trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    print(trainer.extensions)
    trainer.run()


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a SpeedySpeech "
                                     "model with Baker Mandarin TTS dataset.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config")
    parser.add_argument("--train-metadata", type=str, help="training data")
    parser.add_argument("--dev-metadata", type=str, help="dev data")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument(
        "--nprocs", type=int, default=1, help="number of processes")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args = parser.parse_args()
    if args.device == "cpu" and args.nprocs > 1:
        raise RuntimeError("Multiprocess training on CPU is not supported.")
    config = get_cfg_default()
    if args.config:
        config.merge_from_file(args.config)

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.nprocs > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()

@@ -161,3 +161,27 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
                     mode='constant',
                     constant_values=pad_value))
    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)


def batch_sequences(sequences, axis=0, pad_value=0):
    # import pdb; pdb.set_trace()
    seq = sequences[0]
    ndim = seq.ndim
    if axis < 0:
        axis += ndim
    dtype = seq.dtype
    pad_value = dtype.type(pad_value)
    seq_lengths = [seq.shape[axis] for seq in sequences]
    max_length = np.max(seq_lengths)

    padded_sequences = []
    for seq, length in zip(sequences, seq_lengths):
        padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
            ndim - axis - 1)
        padded_seq = np.pad(seq,
                            padding,
                            mode='constant',
                            constant_values=pad_value)
        padded_sequences.append(padded_seq)
    batch = np.stack(padded_sequences)
    return batch
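
As a quick illustration of what `batch_sequences` does (a sketch, not part of the diff): it right-pads every array along `axis` to the longest length in the batch and stacks the results, which is how variable-length phone, tone, and duration sequences get batched.

import numpy as np

a = np.array([1, 2, 3], dtype=np.int64)
b = np.array([4, 5], dtype=np.int64)
batch = batch_sequences([a, b], axis=0, pad_value=0)
# batch == [[1, 2, 3], [4, 5, 0]], shape (2, 3)
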
@@ -768,3 +768,15 @@ class ResidualPWGDiscriminator(nn.Layer):
                pass

        self.apply(_remove_weight_norm)


class PWGInference(nn.Layer):
    def __init__(self, normalizer, pwg_generator):
        super().__init__()
        self.normalizer = normalizer
        self.pwg_generator = pwg_generator

    def forward(self, logmel):
        normalized_mel = self.normalizer(logmel)
        wav = self.pwg_generator.inference(normalized_mel)
        return wav

@@ -0,0 +1,226 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand


class ResidualBlock(nn.Layer):
    def __init__(self, channels, kernel_size, dilation, n=2):
        super().__init__()
        blocks = [
            nn.Sequential(
                nn.Conv1D(
                    channels,
                    channels,
                    kernel_size,
                    dilation=dilation,
                    padding="same",
                    data_format="NLC"),
                nn.ReLU(),
                nn.BatchNorm1D(
                    channels, data_format="NLC"), ) for _ in range(n)
        ]
        self.blocks = nn.Sequential(*blocks)

    def forward(self, x):
        return x + self.blocks(x)


class TextEmbedding(nn.Layer):
    def __init__(self,
                 vocab_size: int,
                 embedding_size: int,
                 tone_vocab_size: int=None,
                 tone_embedding_size: int=None,
                 padding_idx: int=None,
                 tone_padding_idx: int=None,
                 concat: bool=False):
        super().__init__()
        self.text_embedding = nn.Embedding(vocab_size, embedding_size,
                                           padding_idx)
        if tone_vocab_size:
            tone_embedding_size = tone_embedding_size or embedding_size
            if tone_embedding_size != embedding_size and not concat:
                raise ValueError(
                    "embedding size != tone_embedding size, only concat is available."
                )
            self.tone_embedding = nn.Embedding(
                tone_vocab_size, tone_embedding_size, tone_padding_idx)
        self.concat = concat

    def forward(self, text, tone=None):
        text_embed = self.text_embedding(text)
        if tone is None:
            return text_embed
        tone_embed = self.tone_embedding(tone)
        if self.concat:
            embed = paddle.concat([text_embed, tone_embed], -1)
        else:
            embed = text_embed + tone_embed
        return embed


class SpeedySpeechEncoder(nn.Layer):
    def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
                 dilations):
        super().__init__()
        self.embedding = TextEmbedding(
            vocab_size,
            hidden_size,
            tone_size,
            padding_idx=0,
            tone_padding_idx=0)
        self.prenet = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(), )
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1D(
                hidden_size, data_format="NLC"),
            nn.Linear(hidden_size, hidden_size), )

    def forward(self, text, tones):
        embedding = self.embedding(text, tones)
        embedding = self.prenet(embedding)
        x = self.res_blocks(embedding)
        x = embedding + self.postnet1(x)
        x = self.postnet2(x)
        return x


class DurationPredictor(nn.Layer):
    def __init__(self, hidden_size):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualBlock(
                hidden_size, 4, 1, n=1),
            ResidualBlock(
                hidden_size, 3, 1, n=1),
            ResidualBlock(
                hidden_size, 1, 1, n=1),
            nn.Linear(hidden_size, 1))

    def forward(self, x):
        return paddle.squeeze(self.layers(x), -1)


class SpeedySpeechDecoder(nn.Layer):
    def __init__(self, hidden_size, output_size, kernel_size, dilations):
        super().__init__()
        res_blocks = [
            ResidualBlock(
                hidden_size, kernel_size, d, n=2) for d in dilations
        ]
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            ResidualBlock(
                hidden_size, kernel_size, 1, n=2),
            nn.Linear(hidden_size, output_size))

    def forward(self, x):
        xx = self.res_blocks(x)
        x = x + self.postnet1(xx)
        x = self.postnet2(x)
        return x


class SpeedySpeech(nn.Layer):
    def __init__(
            self,
            vocab_size,
            encoder_hidden_size,
            encoder_kernel_size,
            encoder_dilations,
            duration_predictor_hidden_size,
            decoder_hidden_size,
            decoder_output_size,
            decoder_kernel_size,
            decoder_dilations,
            tone_size=None, ):
        super().__init__()
        encoder = SpeedySpeechEncoder(vocab_size, tone_size,
                                      encoder_hidden_size, encoder_kernel_size,
                                      encoder_dilations)
        duration_predictor = DurationPredictor(duration_predictor_hidden_size)
        decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
                                      decoder_kernel_size, decoder_dilations)

        self.encoder = encoder
        self.duration_predictor = duration_predictor
        self.decoder = decoder

    def forward(self, text, tones, plens, durations):
        encodings = self.encoder(text, tones)
        pred_durations = self.duration_predictor(encodings.detach())  # (B, T)

        # expand encodings
        durations_to_expand = durations
        encodings = expand(encodings, durations_to_expand)

        # decode
        # remove positional encoding here
        _, t_dec, feature_size = encodings.shape
        encodings += sinusoid_position_encoding(t_dec, feature_size)
        decoded = self.decoder(encodings)
        return decoded, pred_durations

    def inference(self, text, tones):
        # text: [T]
        # tones: [T]
        text = text.unsqueeze(0)
        if tones is not None:
            tones = tones.unsqueeze(0)

        encodings = self.encoder(text, tones)
        pred_durations = self.duration_predictor(encodings)  # (1, T)
        durations_to_expand = paddle.round(pred_durations.exp())
        durations_to_expand = (durations_to_expand).astype(paddle.int64)
        encodings = expand(encodings, durations_to_expand)

        shape = paddle.shape(encodings)
        t_dec, feature_size = shape[1], shape[2]
        encodings += sinusoid_position_encoding(t_dec, feature_size)
        decoded = self.decoder(encodings)
        return decoded[0]


class SpeedySpeechInference(nn.Layer):
    def __init__(self, normalizer, speedyspeech_model):
        super().__init__()
        self.normalizer = normalizer
        self.acoustic_model = speedyspeech_model

    def forward(self, phones, tones):
        normalized_mel = self.acoustic_model.inference(phones, tones)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel
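
To make the inference wrapper above concrete, here is a rough sketch (not part of the diff) of how `SpeedySpeechInference` is meant to be composed with a `ZScore` normalizer built from the training statistics. The stats file layout (mean in row 0, standard deviation in row 1), the normalizer module path, and the checkpoint loading step are assumptions; `phones` and `tones` are assumed to be 1-D int64 tensors of phone and tone ids.

import numpy as np
import paddle

from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.modules.normalizer import ZScore   # module path assumed

stats = np.load("dump/train/stats.npy")          # illustrative path
mu = paddle.to_tensor(stats[0])                  # assumed: row 0 is the mean
sigma = paddle.to_tensor(stats[1])               # assumed: row 1 is the std
normalizer = ZScore(mu, sigma)

model = SpeedySpeech(**config["model"])          # `config` as loaded in train.py above
# ... load trained parameters into `model` here ...
model.eval()

inference = SpeedySpeechInference(normalizer, model)
logmel = inference(phones, tones)                # denormalized log-mel, fed to the vocoder
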
@@ -403,7 +403,7 @@ class TransformerTTS(nn.Layer):
        else:
            self.toned = False
        # position encoding matrix may be extended later
-        self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
+        self.encoder_pe = pe.sinusoid_positional_encoding(1000, d_encoder)
        self.encoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn,

@@ -411,7 +411,7 @@ class TransformerTTS(nn.Layer):

        # decoder
        self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
-        self.decoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_decoder)
+        self.decoder_pe = pe.sinusoid_positional_encoding(1000, d_decoder)
        self.decoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.decoder = TransformerDecoder(

@@ -488,7 +488,7 @@ class TransformerTTS(nn.Layer):
        # twice its length if needed
        if x.shape[1] * self.r > self.decoder_pe.shape[0]:
            new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
-            self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
+            self.decoder_pe = pe.sinusoid_positional_encoding(new_T,
                                                               self.d_decoder)
        pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
        x = x.scale(math.sqrt(

@@ -0,0 +1,39 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import paddle
from paddle import Tensor


def expand(encodings: Tensor, durations: Tensor) -> Tensor:
    """
    encodings: (B, T, C)
    durations: (B, T)
    """
    batch_size, t_enc = durations.shape
    durations = durations.numpy()
    slens = np.sum(durations, -1)
    t_dec = np.max(slens)
    M = np.zeros([batch_size, t_dec, t_enc])
    for i in range(batch_size):
        k = 0
        for j in range(t_enc):
            d = durations[i, j]
            M[i, k:k + d, j] = 1
            k += d
    M = paddle.to_tensor(M, dtype=encodings.dtype)
    encodings = paddle.matmul(M, encodings)
    return encodings

@@ -0,0 +1,29 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn


class ZScore(nn.Layer):
    # feature last
    def __init__(self, mu, sigma):
        super().__init__()
        self.register_buffer("mu", mu)
        self.register_buffer("sigma", sigma)

    def forward(self, x):
        return (x - self.mu) / self.sigma

    def inverse(self, x):
        return x * self.sigma + self.mu
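
A small usage sketch of `ZScore` (illustrative values only, not part of the diff): normalization and its inverse round-trip a feature-last tensor.

import paddle

mu = paddle.to_tensor([0.5, -1.0])
sigma = paddle.to_tensor([2.0, 0.5])
norm = ZScore(mu, sigma)

x = paddle.to_tensor([[1.5, -1.5], [2.5, 0.0]])   # (batch, feature), feature last
z = norm(x)               # [[0.5, -1.0], [1.0, 2.0]]
x_back = norm.inverse(z)  # recovers x
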
@@ -14,47 +14,56 @@

import math
import numpy as np

import paddle
+from paddle import Tensor
from paddle.nn import functional as F

-__all__ = ["sinusoid_positional_encoding"]
+__all__ = ["sinusoid_position_encoding", "scaled_position_encoding"]


-def sinusoid_positional_encoding(start_index, length, size, dtype=None):
-    r"""Generate standard positional encoding matrix.
-
-    .. math::
-
-        pe(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{size}}}) \\
-        pe(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{size}}})
-
-    Parameters
-    ----------
-    start_index : int
-        The start index.
-    length : int
-        The timesteps of the positional encoding to generate.
-    size : int
-        Feature size of positional encoding.
-
-    Returns
-    -------
-    Tensor [shape=(length, size)]
-        The positional encoding.
-
-    Raises
-    ------
-    ValueError
-        If ``size`` is not divisible by 2.
-    """
-    if (size % 2 != 0):
+def sinusoid_position_encoding(num_positions: int,
+                               feature_size: int,
+                               omega: float=1.0,
+                               start_pos: int=0,
+                               dtype=None) -> Tensor:
+    # return tensor shape (num_positions, feature_size)
+    if (feature_size % 2 != 0):
        raise ValueError("size should be divisible by 2")
    dtype = dtype or paddle.get_default_dtype()
-    channel = np.arange(0, size, 2)
-    index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
-    encodings = np.zeros([length, size])
-    encodings[:, 0::2] = np.sin(p)
-    encodings[:, 1::2] = np.cos(p)
-    encodings = paddle.to_tensor(encodings)
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype)
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros([num_positions, feature_size], dtype=dtype)
+    encodings[:, 0::2] = paddle.sin(p)
+    encodings[:, 1::2] = paddle.cos(p)
+    return encodings
+
+
+def scaled_position_encoding(num_positions: int,
+                             feature_size: int,
+                             omega: Tensor,
+                             start_pos: int=0,
+                             dtype=None) -> Tensor:
+    # omega: Tensor (batch_size, )
+    # return tensor shape (batch_size, num_positions, feature_size)
+    # consider renaming this as batched position encoding
+    if (feature_size % 2 != 0):
+        raise ValueError("size should be divisible by 2")
+    dtype = dtype or paddle.get_default_dtype()
+
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(
+        start_pos, start_pos + num_positions, 1, dtype=omega.dtype)
+    batch_size = omega.shape[0]
+    omega = paddle.unsqueeze(omega, [1, 2])
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros(
+        [batch_size, num_positions, feature_size], dtype=dtype)
+    # it is nice to have fancy indexing and inplace operations
+    encodings[:, :, 0::2] = paddle.sin(p)
+    encodings[:, :, 1::2] = paddle.cos(p)
    return encodings
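
A quick sanity-check sketch for the new API (shapes only; not part of the diff):

import paddle
from parakeet.modules.positional_encoding import sinusoid_position_encoding

pe = sinusoid_position_encoding(100, 256)   # (num_positions, feature_size)
assert pe.shape == [100, 256]
# even channels hold sines, odd channels cosines; position 0 encodes to [0, 1, 0, 1, ...]
print(pe[0, :4])
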
@@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from math import exp

import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F


def gaussian(window_size, sigma):
    gauss = paddle.to_tensor([
        exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
        for x in range(window_size)
    ])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = paddle.matmul(_1D_window,
                               paddle.transpose(_1D_window,
                                                [1, 0])).unsqueeze([0, 1])
    window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(
        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(
        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(
        img1 * img2, window, padding=window_size // 2,
        groups=channel) - mu1_mu2

    C1 = 0.01**2
    C2 = 0.03**2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
        / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


class SSIM(nn.Layer):
    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.window = create_window(window_size, self.channel)

    def forward(self, img1, img2):
        return _ssim(img1, img2, self.window, self.window_size, self.channel,
                     self.size_average)


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.shape
    window = create_window(window_size, channel)
    return _ssim(img1, img2, window, window_size, channel, size_average)
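
For context (not part of the diff): SSIM scores structural similarity between two images, and in SpeedySpeech-style training it is commonly turned into a loss on mel spectrograms treated as single-channel images. A rough usage sketch, with illustrative shapes:

import paddle

pred = paddle.rand([4, 1, 80, 200])    # (batch, channel, n_mels, frames)
target = paddle.rand([4, 1, 80, 200])

criterion = SSIM(window_size=11)
similarity = criterion(pred, target)   # scalar; 1 means identical
loss = 1.0 - similarity                # one common way to turn SSIM into a loss
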
@@ -123,8 +123,6 @@ class Trainer(object):
        update = self.updater.update  # training step
        stop_trigger = self.stop_trigger

-        print(self.updater.state)

        # display only one progress bar
        max_iteration = None
        if isinstance(stop_trigger, LimitTrigger):

@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from copy import deepcopy
+

class IntervalTrigger(object):
    """A Predicate to do something every N cycle."""

@@ -23,9 +25,16 @@ class IntervalTrigger(object):
            raise ValueError("period should be a positive integer.")
        self.period = period
        self.unit = unit
+        self.last_index = None

    def __call__(self, trainer):
-        state = trainer.updater.state
-        index = getattr(state, self.unit)
-        fire = index % self.period == 0
+        if self.last_index is None:
+            last_index = getattr(trainer.updater.state, self.unit)
+            self.last_index = last_index
+
+        last_index = self.last_index
+        index = getattr(trainer.updater.state, self.unit)
+        fire = index // self.period != last_index // self.period
+
+        self.last_index = index
        return fire
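
A short note on this behaviour change (my reading of the diff, not text from the PR): the old predicate fired only when the counter was an exact multiple of the period, so a trigger could be missed whenever the counter skipped over that multiple; the new predicate fires whenever the counter crosses a period boundary. A small sketch of the difference:

# period = 5, counter observed at 3, 7, 12 (the multiples 5 and 10 are never hit exactly)
old_fire = [i % 5 == 0 for i in (3, 7, 12)]        # [False, False, False] -> never fires
last = 0
new_fire = []
for i in (3, 7, 12):
    new_fire.append(i // 5 != last // 5)           # did we cross a multiple of 5?
    last = i
# new_fire == [False, True, True] -> fires once per crossed boundary
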
@@ -106,8 +106,8 @@ class StandardUpdater(UpdaterBase):
        self.update_core(batch)

        self.state.iteration += 1
-        if self.updaters_per_epoch is not None:
-            if self.state.iteration % self.updaters_per_epoch == 0:
+        if self.updates_per_epoch is not None:
+            if self.state.iteration % self.updates_per_epoch == 0:
                self.state.epoch += 1

    def update_core(self, batch):

@@ -139,7 +139,7 @@ class StandardUpdater(UpdaterBase):
            self.optimizer.update()

    @property
-    def updaters_per_epoch(self):
+    def updates_per_epoch(self):
        """Number of updater per epoch, determined by the length of the
        dataloader."""
        length_of_dataloader = None

@@ -0,0 +1,29 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from parakeet.modules import expansion


def test_expand():
    x = paddle.randn([2, 4, 3])  # (B, T, C)
    lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
    y = expansion.expand(x, lengths)

    assert y.shape == [2, 8, 3]
    print("the first sequence")
    print(y[0])

    print("the second sequence")
    print(y[1])

@@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import paddle
from paddle.jit import to_static
from paddle.static import InputSpec


def test_applicative_evaluation():
    def m_sqrt2(x):
        return paddle.scale(x, math.sqrt(2))

    subgraph = to_static(m_sqrt2, input_spec=[InputSpec([-1])])
    paddle.jit.save(subgraph, './temp_test_to_static')

    fn = paddle.jit.load('./temp_test_to_static')
    x = paddle.arange(10, dtype=paddle.float32)
    y = fn(x)

    print(x)
    print(y)