Merge pull request #129 from iclementine/speedyspeech

add speedyspeech model and example with baker dataset.
Hui Zhang 2021-07-19 03:32:51 -05:00 committed by GitHub
commit 25788ab2ca
36 changed files with 2125 additions and 59 deletions

View File

@ -6,18 +6,18 @@
 ###########################################################
 # FEATURE EXTRACTION SETTING #
 ###########################################################
 sr: 24000 # Sampling rate.
-n_fft: 2048 # FFT size.
-hop_length: 300 # Hop size.
-win_length: 1200 # Window length.
+n_fft: 2048 # FFT size (in samples).
+hop_length: 300 # Hop size (in samples).
+win_length: 1200 # Window length (in samples).
 # If set to null, it will be the same as fft_size.
 window: "hann" # Window function.
 n_mels: 80 # Number of mel basis.
-fmin: 80 # Minimum freq in mel basis calculation.
-fmax: 7600 # Maximum frequency in mel basis calculation.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
 # global_gain_scale: 1.0 # Will be multiplied to all of waveform.
 trim_silence: false # Whether to trim the start and end of silence.
 top_db: 60 # Need to tune carefully if the recording is not good.
 trim_frame_length: 2048 # Frame size in trimming. (in samples)
 trim_hop_length: 512 # Hop size in trimming. (in samples)

View File

@ -202,14 +202,12 @@ def process_sentences(config,
 def main():
     # parse config and args
     parser = argparse.ArgumentParser(
-        description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
-    )
+        description="Preprocess audio and then extract features.")
     parser.add_argument(
         "--rootdir",
         default=None,
         type=str,
-        help="directory including wav files. you need to specify either scp or rootdir."
-    )
+        help="directory of the Baker dataset.")
     parser.add_argument(
         "--dumpdir",
         type=str,

View File

@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --dumpdir=dump/train
python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
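# For reference, the dump directory produced by the steps above is expected to
# look roughly like this (a sketch inferred from the scripts, not an exact listing):
#
# dump/
#   train/
#     raw/        # metadata.jsonl + per-utterance feature .npy files from preprocess.py
#     norm/       # metadata.jsonl + normalized features from normalize.py
#     stats.npy   # mean/scale computed by compute_statistics.py
#   dev/
#     raw/
#     norm/
#   test/
#     raw/
#     norm/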

View File

@ -0,0 +1,9 @@
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--nprocs=1

View File

@ -0,0 +1,43 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from parakeet.data.batch import batch_sequences
def collate_baker_examples(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
durations = [
np.array(
item["durations"], dtype=np.int64) for item in examples
]
num_phones = np.array([item["num_phones"] for item in examples])
num_frames = np.array([item["num_frames"] for item in examples])
phones = batch_sequences(phones)
tones = batch_sequences(tones)
feats = batch_sequences(feats)
durations = batch_sequences(durations)
batch = {
"phones": phones,
"tones": tones,
"num_phones": num_phones,
"num_frames": num_frames,
"feats": feats,
"durations": durations,
}
return batch
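# A minimal usage sketch (feature shapes are illustrative):
#
#     import numpy as np
#     examples = [
#         {"phones": [1, 2, 3], "tones": [0, 1, 0], "num_phones": 3,
#          "num_frames": 5, "feats": np.zeros([5, 80]), "durations": [1, 2, 2]},
#         {"phones": [4, 5], "tones": [2, 0], "num_phones": 2,
#          "num_frames": 4, "feats": np.zeros([4, 80]), "durations": [2, 2]},
#     ]
#     batch = collate_baker_examples(examples)
#     # batch["phones"]: (2, 3) int64, right-padded with zeros
#     # batch["feats"]:  (2, 5, 80) float32, right-padded along time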

View File

@ -0,0 +1,109 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Calculate statistics of feature files."""
import argparse
import logging
import os
from pathlib import Path
import numpy as np
import yaml
import json
import jsonlines
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
from parakeet.utils.h5_utils import read_hdf5
from parakeet.utils.h5_utils import write_hdf5
from config import get_cfg_default
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Compute mean and variance of dumped raw features.")
parser.add_argument(
"--metadata", type=str, help="json file with id and file paths ")
parser.add_argument(
"--field-name",
type=str,
help="name of the field to compute statistics for.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--output",
type=str,
help="path to save statistics. if not provided, "
"stats will be saved in the above root directory with name stats.npy")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()
# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
config = get_cfg_default()
# load config
if args.config:
config.merge_from_file(args.config)
# check directory existence
if args.output is None:
args.output = Path(args.metadata).parent.with_name("stats.npy")
else:
args.output = Path(args.output)
args.output.parent.mkdir(parents=True, exist_ok=True)
with jsonlines.open(args.metadata, 'r') as reader:
metadata = list(reader)
dataset = DataTable(
metadata,
fields=[args.field_name],
converters={args.field_name: np.load}, )
logging.info(f"The number of files = {len(dataset)}.")
# calculate statistics
scaler = StandardScaler()
for datum in tqdm(dataset):
# StandardScaler supports (*, num_features) by default
scaler.partial_fit(datum[args.field_name])
stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
np.save(str(args.output), stats.astype(np.float32), allow_pickle=False)
if __name__ == "__main__":
main()
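# Reading the dumped statistics later (a sketch; the file stores mean and scale
# stacked on axis 0, exactly as written by np.stack above):
#
#     import numpy as np
#     mean, scale = np.load("dump/train/stats.npy")
#     normalized = (feats - mean) / scale   # feats: (num_frames, n_mels)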

View File

@ -0,0 +1,60 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
sr: 24000 # Sampling rate.
n_fft: 2048 # FFT size.
hop_length: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
# global_gain_scale: 1.0 # Will be multiplied to all of waveform.
trim_silence: false # Whether to trim the start and end of silence.
top_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_length: 2048 # Frame size in trimming.(in samples)
trim_hop_length: 512 # Hop size in trimming.(in samples)
###########################################################
# DATA SETTING #
###########################################################
batch_size: 32
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
vocab_size: 101 # 99 + 2
tone_size: 8 # 6 + 2
encoder_hidden_size: 128
encoder_kernel_size: 3
encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
duration_predictor_hidden_size: 128
decoder_hidden_size: 128
decoder_output_size: 80
decoder_kernel_size: 3
decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
###########################################################
# OPTIMIZER SETTING #
###########################################################
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 300
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

View File

@ -0,0 +1,25 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
from yacs.config import CfgNode as Configuration
with open("conf/default.yaml", 'rt') as f:
_C = yaml.safe_load(f)
_C = Configuration(_C)
def get_cfg_default():
config = _C.clone()
return config

View File

@ -0,0 +1,92 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import numpy as np
import paddle
import pypinyin
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
phkit.initialize()
from parakeet.frontend.vocab import Vocab
with open("phones.txt", 'rt') as f:
phones = [line.strip() for line in f.readlines()]
with open("tones.txt", 'rt') as f:
tones = [line.strip() for line in f.readlines()]
voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
def segment(sentence):
segments = re.split(r'[:,;。?!]', sentence)
segments = [seg for seg in segments if len(seg)]
return segments
def g2p(sentence):
segments = segment(sentence)
phones = []
phones.append('sil')
tones = []
tones.append('0')
for seg in segments:
seg = jieba.lcut(seg)
initials = lazy_pinyin(
seg, neutral_tone_with_five=True, style=Style.INITIALS)
finals = lazy_pinyin(
seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for c, v in zip(initials, finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if re.match(r'i\d', v):
if c in ['z', 'c', 's']:
v = re.sub('i', 'ii', v)
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
phones.append(c)
tones.append('0')
if v:
phones.append(v[:-1])
tones.append(v[-1])
phones.append('sp')
tones.append('0')
phones[-1] = 'sil'
tones[-1] = '0'
return (phones, tones)
def p2id(voc, phonemes):
phone_ids = [voc.lookup(item) for item in phonemes]
return np.array(phone_ids, np.int64)
def t2id(voc, tones):
tone_ids = [voc.lookup(item) for item in tones]
return np.array(tone_ids, np.int64)
def text_analysis(sentence):
phonemes, tones = g2p(sentence)
print(sentence)
print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])
phone_ids = p2id(voc_phones, phonemes)
tone_ids = t2id(voc_tones, tones)
phones = paddle.to_tensor(phone_ids)
tones = paddle.to_tensor(tone_ids)
return phones, tones
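# Example (a sketch; assumes phones.txt and tones.txt are in the working directory):
#
#     phones, tones = text_analysis("凯莫瑞安联合体的经济崩溃")
#     # phones and tones are paddle int64 tensors of equal length,
#     # directly usable by SpeedySpeech.inference(phones, tones)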

View File

@ -0,0 +1,150 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
import os
from copy import copy
from operator import itemgetter
from pathlib import Path
import numpy as np
import yaml
import jsonlines
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.frontend.vocab import Vocab
from parakeet.datasets.data_table import DataTable
from config import get_cfg_default
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
)
parser.add_argument(
"--metadata",
type=str,
required=True,
help="directory including feature files to be normalized. "
"you need to specify either *-scp or rootdir.")
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump normalized feature files.")
parser.add_argument(
"--stats", type=str, required=True, help="statistics file.")
parser.add_argument(
"--phones",
type=str,
default="phones.txt",
help="phone vocabulary file.")
parser.add_argument(
"--tones", type=str, default="tones.txt", help="tone vocabulary file.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
args = parser.parse_args()
# set logger
if args.verbose > 1:
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
elif args.verbose > 0:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
else:
logging.basicConfig(
level=logging.WARN,
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
)
logging.warning('Skip DEBUG/INFO messages')
# load config
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
# check directory existence
dumpdir = Path(args.dumpdir).resolve()
dumpdir.mkdir(parents=True, exist_ok=True)
# get dataset
with jsonlines.open(args.metadata, 'r') as reader:
metadata = list(reader)
dataset = DataTable(metadata, converters={'feats': np.load, })
logging.info(f"The number of files = {len(dataset)}.")
# restore scaler
scaler = StandardScaler()
scaler.mean_ = np.load(args.stats)[0]
scaler.scale_ = np.load(args.stats)[1]
# from version 0.23.0, this information is needed
scaler.n_features_in_ = scaler.mean_.shape[0]
with open(args.phones, 'rt') as f:
phones = [line.strip() for line in f.readlines()]
with open(args.tones, 'rt') as f:
tones = [line.strip() for line in f.readlines()]
voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
# process each file
output_metadata = []
for item in tqdm(dataset):
utt_id = item['utt_id']
mel = item['feats']
# normalize
mel = scaler.transform(mel)
# save
mel_path = dumpdir / f"{utt_id}-feats.npy"
np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
phone_ids = [voc_phones.lookup(p) for p in item['phones']]
tone_ids = [voc_tones.lookup(t) for t in item['tones']]
output_metadata.append({
'utt_id': utt_id,
'phones': phone_ids,
'tones': tone_ids,
'num_phones': item['num_phones'],
'num_frames': item['num_frames'],
'durations': item['durations'],
'feats': str(mel_path),
})
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
with jsonlines.open(output_metadata_path, 'w') as writer:
for item in output_metadata:
writer.write(item)
logging.info(f"metadata dumped into {output_metadata_path}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,99 @@
b
p
m
f
d
t
n
l
g
k
h
zh
ch
sh
r
z
c
s
j
q
x
a
ar
ai
air
ao
aor
an
anr
ang
angr
e
er
ei
eir
en
enr
eng
engr
o
or
ou
our
ong
ongr
ii
iir
iii
iiir
i
ir
ia
iar
iao
iaor
ian
ianr
iang
iangr
ie
ier
io
ior
iou
iour
iong
iongr
in
inr
ing
ingr
u
ur
ua
uar
uai
uair
uan
uanr
uang
uangr
uei
ueir
uo
uor
uen
uenr
ueng
uengr
v
vr
ve
ver
van
vanr
vn
vnr
sil
sp

View File

@ -0,0 +1,309 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Dict, Any
import soundfile as sf
import librosa
import numpy as np
import argparse
import yaml
import json
import re
import jsonlines
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from pathlib import Path
import tqdm
from operator import itemgetter
from praatio import tgio
import logging
from config import get_cfg_default
from tg_utils import validate_textgrid
def logmelfilterbank(audio,
sr,
n_fft=1024,
hop_length=256,
win_length=None,
window="hann",
n_mels=80,
fmin=None,
fmax=None,
eps=1e-10):
"""Compute log-Mel filterbank feature.
Parameters
----------
audio : ndarray
Audio signal (T,).
sr : int
Sampling rate.
n_fft : int
FFT size. (Default value = 1024)
hop_length : int
Hop size. (Default value = 256)
win_length : int
Window length. If set to None, it will be the same as fft_size. (Default value = None)
window : str
Window function type. (Default value = "hann")
n_mels : int
Number of mel basis. (Default value = 80)
fmin : int
Minimum frequency in mel basis calculation. (Default value = None)
fmax : int
Maximum frequency in mel basis calculation. (Default value = None)
eps : float
Epsilon value to avoid inf in log calculation. (Default value = 1e-10)
Returns
-------
np.ndarray
Log Mel filterbank feature (#frames, num_mels).
"""
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
pad_mode="reflect")
spc = np.abs(x_stft) # (#bins, #frames,)
# get mel basis
fmin = 0 if fmin is None else fmin
fmax = sr / 2 if fmax is None else fmax
mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
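# In short, the function above computes (a summary, not new behavior):
#
#     logmel = log10(max(eps, mel_basis @ |STFT(y)|))
#
# where mel_basis has shape (n_mels, 1 + n_fft // 2), so the result is
# (n_mels, num_frames); process_sentence below transposes it when saving.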
def process_sentence(config: Dict[str, Any],
fp: Path,
alignment_fp: Path,
output_dir: Path):
utt_id = fp.stem
# reading
y, sr = librosa.load(fp, sr=config.sr) # resampling may occur
assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(y).max(
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
duration = librosa.get_duration(y, sr=sr)
# intervals with empty labels are ignored
alignment = tgio.openTextgrid(alignment_fp)
# validate text grid against audio file
num_samples = y.shape[0]
assert validate_textgrid(alignment, num_samples, sr)
# only with baker's annotation
intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
first, last = intervals[0], intervals[-1]
if not (first.label == "sil" and first.end < duration):
logging.warning(
f" There is something wrong with the fisrt interval {first} in utterance: {utt_id}"
)
if not (last.label == "sil" and last.start < duration):
logging.warning(
f" There is something wrong with the last interval {last} in utterance: {utt_id}"
)
logmel = logmelfilterbank(
y,
sr=sr,
n_fft=config.n_fft,
window=config.window,
win_length=config.win_length,
hop_length=config.hop_length,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# extract phone and duration
phones = []
tones = []
ends = []
durations_sec = []
for interval in intervals:
label = interval.label
label = label.replace("sp1", "sp") # Baker has sp1 rather than sp
# split tone from finals
match = re.match(r'^(\w+)([012345])$', label)
if match:
phones.append(match.group(1))
tones.append(match.group(2))
else:
phones.append(label)
tones.append('0')
end = min(duration, interval.end)
ends.append(end)
durations_sec.append(end - interval.start) # duration in seconds
frame_pos = librosa.time_to_frames(
ends, sr=sr, hop_length=config.hop_length)
durations_frame = np.diff(frame_pos, prepend=0)
num_frames = logmel.shape[-1] # number of frames of the spectrogram
extra = np.sum(durations_frame) - num_frames
assert extra <= 0, (
f"Number of frames inferred from alignemnt is "
f"larger than number of frames of the spectrogram by {extra} frames")
durations_frame[-1] += (-extra)
assert np.sum(durations_frame) == num_frames
durations_frame = durations_frame.tolist()
mel_path = output_dir / (utt_id + "_feats.npy")
np.save(mel_path, logmel.T) # (num_frames, n_mels)
record = {
"utt_id": utt_id,
"phones": phones,
"tones": tones,
"num_phones": len(phones),
"num_frames": num_frames,
"durations": durations_frame,
"feats": str(mel_path.resolve()), # use absolute path
}
return record
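# A resulting line in metadata.jsonl looks roughly like this (values illustrative):
#
#     {"utt_id": "000001", "phones": ["sil", "k", "ai", ...],
#      "tones": ["0", "3", "3", ...], "num_phones": 37, "num_frames": 512,
#      "durations": [10, 4, 8, ...],
#      "feats": "/abs/path/dump/train/raw/000001_feats.npy"}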
def process_sentences(config,
fps: List[Path],
alignment_fps: List[Path],
output_dir: Path,
nprocs: int=1):
if nprocs == 1:
results = []
for fp, alignment_fp in tqdm.tqdm(
zip(fps, alignment_fps), total=len(fps)):
results.append(
process_sentence(config, fp, alignment_fp, output_dir))
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp, alignment_fp in zip(fps, alignment_fps):
future = pool.submit(process_sentence, config, fp,
alignment_fp, output_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
results = []
for ft in futures:
results.append(ft.result())
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
print("Done")
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--rootdir",
default=None,
type=str,
help="directory to baker dataset.")
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump feature files.")
parser.add_argument(
"--config", type=str, help="yaml format configuration file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num_cpu", type=int, default=1, help="number of process.")
args = parser.parse_args()
C = get_cfg_default()
if args.config:
C.merge_from_file(args.config)
C.freeze()
if args.verbose > 1:
print(vars(args))
print(C)
root_dir = Path(args.rootdir).expanduser()
dumpdir = Path(args.dumpdir).expanduser()
dumpdir.mkdir(parents=True, exist_ok=True)
wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
alignment_files = sorted(
list((root_dir / "PhoneLabeling").rglob("*.interval")))
# filter out several files that have errors in annotation
exclude = {'000611', '000662', '002365', '005107'}
wav_files = [f for f in wav_files if f.stem not in exclude]
alignment_files = [f for f in alignment_files if f.stem not in exclude]
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
train_alignment_files = alignment_files[:num_train]
dev_alignment_files = alignment_files[num_train:num_train + num_dev]
test_alignment_files = alignment_files[num_train + num_dev:]
train_dump_dir = dumpdir / "train" / "raw"
train_dump_dir.mkdir(parents=True, exist_ok=True)
dev_dump_dir = dumpdir / "dev" / "raw"
dev_dump_dir.mkdir(parents=True, exist_ok=True)
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
# process for the 3 sections
process_sentences(
C,
train_wav_files,
train_alignment_files,
train_dump_dir,
nprocs=args.num_cpu)
process_sentences(
C,
dev_wav_files,
dev_alignment_files,
dev_dump_dir,
nprocs=args.num_cpu)
process_sentences(
C,
test_wav_files,
test_alignment_files,
test_dump_dir,
nprocs=args.num_cpu)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,6 @@
python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --output=dump/train/stats.npy
python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy

View File

@ -0,0 +1,6 @@
python train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--nprocs=1

View File

@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。

View File

@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.nn import functional as F
from paddle.fluid.layers import huber_loss
from parakeet.modules.ssim import ssim
from parakeet.modules.losses import masked_l1_loss, weighted_mean
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.models.speedyspeech import SpeedySpeech
class SpeedySpeechUpdater(StandardUpdater):
def update_core(self, batch):
decoded, predicted_durations = self.model(
text=batch["phones"],
tones=batch["tones"],
plens=batch["num_phones"],
durations=batch["durations"])
target_mel = batch["feats"]
spec_mask = F.sequence_mask(
batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
text_mask = F.sequence_mask(
batch["num_phones"], dtype=predicted_durations.dtype)
# spec loss
l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)
# duration loss
target_durations = batch["durations"]
target_durations = paddle.maximum(
target_durations.astype(predicted_durations.dtype),
paddle.to_tensor([1.0]))
duration_loss = weighted_mean(
huber_loss(
predicted_durations, paddle.log(target_durations), delta=1.0),
text_mask, )
# ssim loss
ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
(target_mel * spec_mask).unsqueeze(1))
loss = l1_loss + ssim_loss + duration_loss
optimizer = self.optimizer
optimizer.clear_grad()
loss.backward()
optimizer.step()
report("train/loss", float(loss))
report("train/l1_loss", float(l1_loss))
report("train/duration_loss", float(duration_loss))
report("train/ssim_loss", float(ssim_loss))
class SpeedySpeechEvaluator(StandardEvaluator):
def evaluate_core(self, batch):
print("fire")
decoded, predicted_durations = self.model(
text=batch["phones"],
tones=batch["tones"],
plens=batch["num_phones"],
durations=batch["durations"])
target_mel = batch["feats"]
spec_mask = F.sequence_mask(
batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
text_mask = F.sequence_mask(
batch["num_phones"], dtype=predicted_durations.dtype)
# spec loss
l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)
# duration loss
target_durations = batch["durations"]
target_durations = paddle.maximum(
target_durations.astype(predicted_durations.dtype),
paddle.to_tensor([1.0]))
duration_loss = weighted_mean(
huber_loss(
predicted_durations, paddle.log(target_durations), delta=1.0),
text_mask, )
# ssim loss
ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
(target_mel * spec_mask).unsqueeze(1))
loss = l1_loss + ssim_loss + duration_loss
report("eval/loss", float(loss))
report("eval/l1_loss", float(l1_loss))
report("eval/duration_loss", float(duration_loss))
report("eval/ssim_loss", float(ssim_loss))

View File

@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore
def evaluate(args, speedyspeech_config, pwg_config):
# the DataLoader logger is too verbose, so disable it
logging.getLogger("DataLoader").disabled = True
# construct dataset for evaluation
with jsonlines.open(args.test_metadata, 'r') as reader:
test_metadata = list(reader)
test_dataset = DataTable(
data=test_metadata, fields=["utt_id", "phones", "tones"])
model = SpeedySpeech(**speedyspeech_config["model"])
model.set_state_dict(
paddle.load(args.speedyspeech_checkpoint)["main_params"])
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
stat = np.load(args.speedyspeech_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
speedyspeech_normalizer = ZScore(mu, std)
stat = np.load(args.pwg_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, model)
pwg_inference = PWGInference(pwg_normalizer, vocoder)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
for datum in test_dataset:
utt_id = datum["utt_id"]
phones = paddle.to_tensor(datum["phones"])
tones = paddle.to_tensor(datum["tones"])
with paddle.no_grad():
wav = pwg_inference(speedyspeech_inference(phones, tones))
sf.write(
output_dir / (utt_id + ".wav"),
wav.numpy(),
samplerate=speedyspeech_config.sr)
print(f"{utt_id} done!")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file to overwrite default config")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
help="speedyspeech checkpoint to load.")
parser.add_argument(
"--speedyspeech-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument(
"--pwg-config",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument(
"--pwg-params",
type=str,
help="parallel wavegan generator parameters to load.")
parser.add_argument(
"--pwg-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument("--test-metadata", type=str, help="test metadata")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
args = parser.parse_args()
with open(args.speedyspeech_config) as f:
speedyspeech_config = CfgNode(yaml.safe_load(f))
with open(args.pwg_config) as f:
pwg_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(speedyspeech_config)
print(pwg_config)
evaluate(args, speedyspeech_config, pwg_config)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,10 @@
python synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-stat=dump/train/stats.npy \
--pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
--pwg-params=../../parallelwave_gan/baker/converted.pdparams \
--pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/debug/test \
--device="gpu"

View File

@ -0,0 +1,150 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
import soundfile as sf
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
from parakeet.modules.normalizer import ZScore
from frontend import text_analysis
def evaluate(args, speedyspeech_config, pwg_config):
# the DataLoader logger is too verbose, so disable it
logging.getLogger("DataLoader").disabled = True
# construct dataset for evaluation
sentences = []
with open(args.text, 'rt') as f:
for line in f:
utt_id, sentence = line.strip().split()
sentences.append((utt_id, sentence))
model = SpeedySpeech(**speedyspeech_config["model"])
model.set_state_dict(
paddle.load(args.speedyspeech_checkpoint)["main_params"])
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
stat = np.load(args.speedyspeech_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
speedyspeech_normalizer = ZScore(mu, std)
stat = np.load(args.pwg_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, model)
pwg_inference = PWGInference(pwg_normalizer, vocoder)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
for utt_id, sentence in sentences:
phones, tones = text_analysis(sentence)
with paddle.no_grad():
wav = pwg_inference(speedyspeech_inference(phones, tones))
sf.write(
output_dir / (utt_id + ".wav"),
wav.numpy(),
samplerate=speedyspeech_config.sr)
print(f"{utt_id} done!")
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(
description="Synthesize with speedyspeech & parallel wavegan.")
parser.add_argument(
"--speedyspeech-config",
type=str,
help="config file to overwrite default config")
parser.add_argument(
"--speedyspeech-checkpoint",
type=str,
help="speedyspeech checkpoint to load.")
parser.add_argument(
"--speedyspeech-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument(
"--pwg-config",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument(
"--pwg-params",
type=str,
help="parallel wavegan generator parameters to load.")
parser.add_argument(
"--pwg-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
)
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
args = parser.parse_args()
with open(args.speedyspeech_config) as f:
speedyspeech_config = CfgNode(yaml.safe_load(f))
with open(args.pwg_config) as f:
pwg_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(speedyspeech_config)
print(pwg_config)
evaluate(args, speedyspeech_config, pwg_config)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,10 @@
python synthesize_e2e.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
--speedyspeech-stat=dump/train/stats.npy \
--pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
--pwg-params=../../parallelwave_gan/baker/converted.pdparams \
--pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
--text=sentences.txt \
--output-dir=exp/e2e \
--device="gpu"

View File

@ -0,0 +1,27 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
from praatio import tgio
def validate_textgrid(text_grid, num_samples, sr):
"""Validate Text Grid to make sure that the time interval annotated
by the text grid file does not go beyond the audio file.
"""
start = text_grid.minTimestamp
end = text_grid.maxTimestamp
end_audio = librosa.samples_to_time(num_samples, sr)
return start == 0.0 and end <= end_audio
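# Usage sketch (preprocess.py asserts on the returned bool, since the function
# only reports whether the annotation fits inside the audio):
#
#     alignment = tgio.openTextgrid("000001.interval")
#     assert validate_textgrid(alignment, num_samples=y.shape[0], sr=24000)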

View File

@ -0,0 +1,6 @@
0
1
2
3
4
5

View File

@ -0,0 +1,186 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging
import argparse
import dataclasses
from pathlib import Path
import yaml
import jsonlines
import paddle
import numpy as np
from paddle import nn
from paddle.nn import functional as F
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.optimizer import Adam # No RAdam
from paddle.optimizer.lr import StepDecay
from paddle import DataParallel
from visualdl import LogWriter
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.training.updater import UpdaterBase
from parakeet.training.trainer import Trainer
from parakeet.training.reporter import report
from parakeet.training import extension
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from batch_fn import collate_baker_examples
from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
from config import get_cfg_default
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
if not paddle.is_compiled_with_cuda():
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed, it is a must for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
# the DataLoader logger is too verbose, so disable it
logging.getLogger("DataLoader").disabled = True
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=[
"phones", "tones", "num_phones", "num_frames", "feats", "durations"
],
converters={"feats": np.load, }, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=False,
drop_last=True)
# dev_sampler = DistributedBatchSampler(dev_dataset,
# batch_size=config.batch_size,
# shuffle=False,
# drop_last=False)
print("samplers done!")
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_baker_examples,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
shuffle=False,
drop_last=False,
batch_size=config.batch_size,
collate_fn=collate_baker_examples,
num_workers=config.num_workers)
print("dataloaders done!")
model = SpeedySpeech(**config["model"])
if world_size > 1:
model = DataParallel(model) # TODO, do not use vocab size from config
# print(model)
print("model done!")
optimizer = Adam(
0.001,
parameters=model.parameters(),
grad_clip=nn.ClipGradByGlobalNorm(5.0))
print("optimizer done!")
updater = SpeedySpeechUpdater(
model=model, optimizer=optimizer, dataloader=train_dataloader)
output_dir = Path(args.output_dir)
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
evaluator = SpeedySpeechEvaluator(model, dev_dataloader)
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
writer = LogWriter(str(output_dir))
trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
trainer.extend(
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
print(trainer.extensions)
trainer.run()
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
"model with Baker Mandrin TTS dataset.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config")
parser.add_argument("--train-metadata", type=str, help="training data")
parser.add_argument("--dev-metadata", type=str, help="dev data")
parser.add_argument("--output-dir", type=str, help="output dir")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use")
parser.add_argument(
"--nprocs", type=int, default=1, help="number of processes")
parser.add_argument("--verbose", type=int, default=1, help="verbose")
args = parser.parse_args()
if args.device == "cpu" and args.nprocs > 1:
raise RuntimeError("Multiprocess training on CPU is not supported.")
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
# dispatch
if args.nprocs > 1:
dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
else:
train_sp(args, config)
if __name__ == "__main__":
main()

View File

@ -161,3 +161,27 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
            mode='constant',
            constant_values=pad_value))
    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
def batch_sequences(sequences, axis=0, pad_value=0):
seq = sequences[0]
ndim = seq.ndim
if axis < 0:
axis += ndim
dtype = seq.dtype
pad_value = dtype.type(pad_value)
seq_lengths = [seq.shape[axis] for seq in sequences]
max_length = np.max(seq_lengths)
padded_sequences = []
for seq, length in zip(sequences, seq_lengths):
padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
ndim - axis - 1)
padded_seq = np.pad(seq,
padding,
mode='constant',
constant_values=pad_value)
padded_sequences.append(padded_seq)
batch = np.stack(padded_sequences)
return batch
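# Usage sketch: pad three variable-length sequences into one (3, 3) batch.
#
#     seqs = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6])]
#     batch_sequences(seqs)
#     # -> array([[1, 2, 3],
#     #           [4, 5, 0],
#     #           [6, 0, 0]])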

View File

@ -768,3 +768,15 @@ class ResidualPWGDiscriminator(nn.Layer):
                pass
        self.apply(_remove_weight_norm)
class PWGInference(nn.Layer):
def __init__(self, normalizer, pwg_generator):
super().__init__()
self.normalizer = normalizer
self.pwg_generator = pwg_generator
def forward(self, logmel):
normalized_mel = self.normalizer(logmel)
wav = self.pwg_generator.inference(normalized_mel)
return wav

View File

@ -0,0 +1,226 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.modules.positional_encoding import sinusoid_position_encoding
from parakeet.modules.expansion import expand
class ResidualBlock(nn.Layer):
def __init__(self, channels, kernel_size, dilation, n=2):
super().__init__()
blocks = [
nn.Sequential(
nn.Conv1D(
channels,
channels,
kernel_size,
dilation=dilation,
padding="same",
data_format="NLC"),
nn.ReLU(),
nn.BatchNorm1D(
channels, data_format="NLC"), ) for _ in range(n)
]
self.blocks = nn.Sequential(*blocks)
def forward(self, x):
return x + self.blocks(x)
class TextEmbedding(nn.Layer):
def __init__(self,
vocab_size: int,
embedding_size: int,
tone_vocab_size: int=None,
tone_embedding_size: int=None,
padding_idx: int=None,
tone_padding_idx: int=None,
concat: bool=False):
super().__init__()
self.text_embedding = nn.Embedding(vocab_size, embedding_size,
padding_idx)
if tone_vocab_size:
tone_embedding_size = tone_embedding_size or embedding_size
if tone_embedding_size != embedding_size and not concat:
raise ValueError(
"embedding size != tone_embedding size, only conat is avaiable."
)
self.tone_embedding = nn.Embedding(
tone_vocab_size, tone_embedding_size, tone_padding_idx)
self.concat = concat
def forward(self, text, tone=None):
text_embed = self.text_embedding(text)
if tone is None:
return text_embed
tone_embed = self.tone_embedding(tone)
if self.concat:
embed = paddle.concat([text_embed, tone_embed], -1)
else:
embed = text_embed + tone_embed
return embed
class SpeedySpeechEncoder(nn.Layer):
def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
dilations):
super().__init__()
self.embedding = TextEmbedding(
vocab_size,
hidden_size,
tone_size,
padding_idx=0,
tone_padding_idx=0)
self.prenet = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.ReLU(), )
res_blocks = [
ResidualBlock(
hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
nn.ReLU(),
nn.BatchNorm1D(
hidden_size, data_format="NLC"),
nn.Linear(hidden_size, hidden_size), )
def forward(self, text, tones):
embedding = self.embedding(text, tones)
embedding = self.prenet(embedding)
x = self.res_blocks(embedding)
x = embedding + self.postnet1(x)
x = self.postnet2(x)
return x
class DurationPredictor(nn.Layer):
def __init__(self, hidden_size):
super().__init__()
self.layers = nn.Sequential(
ResidualBlock(
hidden_size, 4, 1, n=1),
ResidualBlock(
hidden_size, 3, 1, n=1),
ResidualBlock(
hidden_size, 1, 1, n=1),
nn.Linear(hidden_size, 1))
def forward(self, x):
return paddle.squeeze(self.layers(x), -1)
class SpeedySpeechDecoder(nn.Layer):
def __init__(self, hidden_size, output_size, kernel_size, dilations):
super().__init__()
res_blocks = [
ResidualBlock(
hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
ResidualBlock(
hidden_size, kernel_size, 1, n=2),
nn.Linear(hidden_size, output_size))
def forward(self, x):
xx = self.res_blocks(x)
x = x + self.postnet1(xx)
x = self.postnet2(x)
return x
class SpeedySpeech(nn.Layer):
def __init__(
self,
vocab_size,
encoder_hidden_size,
encoder_kernel_size,
encoder_dilations,
duration_predictor_hidden_size,
decoder_hidden_size,
decoder_output_size,
decoder_kernel_size,
decoder_dilations,
tone_size=None, ):
super().__init__()
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
encoder_hidden_size, encoder_kernel_size,
encoder_dilations)
duration_predictor = DurationPredictor(duration_predictor_hidden_size)
decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
decoder_kernel_size, decoder_dilations)
self.encoder = encoder
self.duration_predictor = duration_predictor
self.decoder = decoder
def forward(self, text, tones, plens, durations):
encodings = self.encoder(text, tones)
pred_durations = self.duration_predictor(encodings.detach()) # (B, T)
# expand encodings
durations_to_expand = durations
encodings = expand(encodings, durations_to_expand)
# decode
# remove positional encoding here
_, t_dec, feature_size = encodings.shape
encodings += sinusoid_position_encoding(t_dec, feature_size)
decoded = self.decoder(encodings)
return decoded, pred_durations
def inference(self, text, tones):
# text: [T]
# tones: [T]
text = text.unsqueeze(0)
if tones is not None:
tones = tones.unsqueeze(0)
encodings = self.encoder(text, tones)
pred_durations = self.duration_predictor(encodings) # (1, T)
durations_to_expand = paddle.round(pred_durations.exp())
durations_to_expand = (durations_to_expand).astype(paddle.int64)
encodings = expand(encodings, durations_to_expand)
shape = paddle.shape(encodings)
t_dec, feature_size = shape[1], shape[2]
encodings += sinusoid_position_encoding(t_dec, feature_size)
decoded = self.decoder(encodings)
return decoded[0]
class SpeedySpeechInference(nn.Layer):
def __init__(self, normalizer, speedyspeech_model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = speedyspeech_model
def forward(self, phones, tones):
normalized_mel = self.acoustic_model.inference(phones, tones)
logmel = self.normalizer.inverse(normalized_mel)
return logmel

View File

@ -403,7 +403,7 @@ class TransformerTTS(nn.Layer):
         else:
             self.toned = False
         # position encoding matrix may be extended later
-        self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
+        self.encoder_pe = pe.sinusoid_positional_encoding(1000, d_encoder)
         self.encoder_pe_scalar = self.create_parameter(
             [1], attr=I.Constant(1.))
         self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn,
@ -411,7 +411,7 @@ class TransformerTTS(nn.Layer):
         # decoder
         self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
-        self.decoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_decoder)
+        self.decoder_pe = pe.sinusoid_positional_encoding(1000, d_decoder)
         self.decoder_pe_scalar = self.create_parameter(
             [1], attr=I.Constant(1.))
         self.decoder = TransformerDecoder(
@ -488,7 +488,7 @@ class TransformerTTS(nn.Layer):
         # twice its length if needed
         if x.shape[1] * self.r > self.decoder_pe.shape[0]:
             new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
-            self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
-                                                              self.d_decoder)
+            self.decoder_pe = pe.sinusoid_positional_encoding(new_T,
+                                                              self.d_decoder)
         pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
         x = x.scale(math.sqrt(

View File

@ -0,0 +1,39 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import Tensor


def expand(encodings: Tensor, durations: Tensor) -> Tensor:
    """Repeat each encoder frame by its duration.

    encodings: (B, T, C)
    durations: (B, T), int
    """
    batch_size, t_enc = durations.shape
    durations = durations.numpy()
    slens = np.sum(durations, -1)  # total expanded length per sequence
    t_dec = np.max(slens)  # decoder length = longest expansion in the batch
    M = np.zeros([batch_size, t_dec, t_enc])
    for i in range(batch_size):
        k = 0
        for j in range(t_enc):
            d = durations[i, j]
            # decoder frames k .. k + d - 1 all copy encoder step j
            M[i, k:k + d, j] = 1
            k += d
    M = paddle.to_tensor(M, dtype=encodings.dtype)
    # (B, T_dec, T_enc) @ (B, T_enc, C) -> (B, T_dec, C)
    encodings = paddle.matmul(M, encodings)
    return encodings
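
A tiny worked example of the alignment matrix this builds: durations [1, 2, 1] yield M of shape (1, 4, 3), and matmul(M, x) repeats the middle frame twice (values below are chosen for illustration):

import paddle

x = paddle.to_tensor([[[1., 1.], [2., 2.], [3., 3.]]])  # (B=1, T_enc=3, C=2)
d = paddle.to_tensor([[1, 2, 1]])                       # (B=1, T_enc=3)
y = expand(x, d)
print(y.shape)  # [1, 4, 2]
print(y[0])     # rows: [1, 1], [2, 2], [2, 2], [3, 3]
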

View File

@ -0,0 +1,29 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn


class ZScore(nn.Layer):
    # feature last: mu and sigma broadcast over the trailing feature axis
    def __init__(self, mu, sigma):
        super().__init__()
        self.register_buffer("mu", mu)
        self.register_buffer("sigma", sigma)

    def forward(self, x):
        return (x - self.mu) / self.sigma

    def inverse(self, x):
        return x * self.sigma + self.mu
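
Because mu and sigma are registered as buffers, they travel with the layer's state dict and are captured by paddle.jit.save, which is what lets the inference wrapper above bundle training statistics into an exported graph. A quick round-trip sketch:

import paddle

mu = paddle.to_tensor([0.5, -0.5])
sigma = paddle.to_tensor([2.0, 4.0])
norm = ZScore(mu, sigma)

x = paddle.to_tensor([[1.0, 3.0], [5.0, 7.0]])  # feature-last, C=2
z = norm(x)                                     # (x - mu) / sigma
print(bool(paddle.allclose(norm.inverse(z), x)))  # True
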

View File

@ -14,47 +14,56 @@
 import math

 import numpy as np
 import paddle
+from paddle import Tensor
 from paddle.nn import functional as F

-__all__ = ["sinusoid_positional_encoding"]
+__all__ = ["sinusoid_position_encoding", "scaled_position_encoding"]


-def sinusoid_positional_encoding(start_index, length, size, dtype=None):
-    r"""Generate standard positional encoding matrix.
-
-    .. math::
-
-        pe(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{size}}}) \\
-        pe(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{size}}})
-
-    Parameters
-    ----------
-    start_index : int
-        The start index.
-    length : int
-        The timesteps of the positional encoding to generate.
-    size : int
-        Feature size of positional encoding.
-
-    Returns
-    -------
-    Tensor [shape=(length, size)]
-        The positional encoding.
-
-    Raises
-    ------
-    ValueError
-        If ``size`` is not divisible by 2.
-    """
-    if (size % 2 != 0):
+def sinusoid_position_encoding(num_positions: int,
+                               feature_size: int,
+                               omega: float=1.0,
+                               start_pos: int=0,
+                               dtype=None) -> Tensor:
+    # return tensor shape (num_positions, feature_size)
+    if (feature_size % 2 != 0):
         raise ValueError("size should be divisible by 2")
     dtype = dtype or paddle.get_default_dtype()
-    channel = np.arange(0, size, 2)
-    index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
-    encodings = np.zeros([length, size])
-    encodings[:, 0::2] = np.sin(p)
-    encodings[:, 1::2] = np.cos(p)
-    encodings = paddle.to_tensor(encodings)
+
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype)
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros([num_positions, feature_size], dtype=dtype)
+    encodings[:, 0::2] = paddle.sin(p)
+    encodings[:, 1::2] = paddle.cos(p)
+    return encodings
+
+
+def scaled_position_encoding(num_positions: int,
+                             feature_size: int,
+                             omega: Tensor,
+                             start_pos: int=0,
+                             dtype=None) -> Tensor:
+    # omega: Tensor (batch_size, )
+    # return tensor shape (batch_size, num_positions, feature_size)
+    # consider renaming this as batched position encoding
+    if (feature_size % 2 != 0):
+        raise ValueError("size should be divisible by 2")
+    dtype = dtype or paddle.get_default_dtype()
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(
+        start_pos, start_pos + num_positions, 1, dtype=omega.dtype)
+    batch_size = omega.shape[0]
+    omega = paddle.unsqueeze(omega, [1, 2])
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros(
+        [batch_size, num_positions, feature_size], dtype=dtype)
+    # it is nice to have fancy indexing and inplace operations
+    encodings[:, :, 0::2] = paddle.sin(p)
+    encodings[:, :, 1::2] = paddle.cos(p)
     return encodings
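
The reworked API stays in paddle (no numpy round trip) and adds a batched variant in which omega gives each batch element its own position rate. A shape and scaling check under the signatures above (the import path is an assumption, not shown in this hunk):

import paddle
# assumed import path:
# from parakeet.modules.positional_encoding import (
#     sinusoid_position_encoding, scaled_position_encoding)

pe = sinusoid_position_encoding(num_positions=100, feature_size=64)
print(pe.shape)  # [100, 64]

omega = paddle.to_tensor([1.0, 0.5])  # one rate per batch element
pe_b = scaled_position_encoding(100, 64, omega)
print(pe_b.shape)  # [2, 100, 64]

# with omega = 0.5, position t is encoded like position t / 2
print(bool(paddle.allclose(pe_b[1, 8], pe[4], atol=1e-6)))  # True
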

parakeet/modules/ssim.py Normal file (84 lines)
View File

@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import exp

import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F


def gaussian(window_size, sigma):
    # 1-D gaussian kernel, normalized to sum to 1
    gauss = paddle.to_tensor([
        exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
        for x in range(window_size)
    ])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    # separable 2-D gaussian window, one copy per channel for grouped conv
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = paddle.matmul(_1D_window,
                               paddle.transpose(_1D_window,
                                                [1, 0])).unsqueeze([0, 1])
    window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    # local means via gaussian filtering
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # local (co)variances
    sigma1_sq = F.conv2d(
        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(
        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(
        img1 * img2, window, padding=window_size // 2,
        groups=channel) - mu1_mu2

    # stabilizing constants from the SSIM paper (K1=0.01, K2=0.03, L=1)
    C1 = 0.01**2
    C2 = 0.03**2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
        / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


class SSIM(nn.Layer):
    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.window = create_window(window_size, self.channel)

    def forward(self, img1, img2):
        return _ssim(img1, img2, self.window, self.window_size, self.channel,
                     self.size_average)


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.shape
    window = create_window(window_size, channel)
    return _ssim(img1, img2, window, window_size, channel, size_average)
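
Treating spectrograms as single-channel images, the functional form can be used directly as a similarity score or flipped into a loss term; the sketch below assumes log-mel inputs of shape (B, 1, T, n_mels) and an import path that is not shown in this file:

import paddle
# assumed: from parakeet.modules.ssim import ssim

mel_pred = paddle.rand([4, 1, 200, 80])
mel_target = paddle.rand([4, 1, 200, 80])

score = ssim(mel_pred, mel_target)  # scalar, 1.0 for identical inputs
loss = 1.0 - score                  # a common way to use SSIM as a loss
print(float(score))
print(float(ssim(mel_target, mel_target)))  # 1.0
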

View File

@ -123,8 +123,6 @@ class Trainer(object):
         update = self.updater.update  # training step
         stop_trigger = self.stop_trigger

-        print(self.updater.state)
-
         # display only one progress bar
         max_iteration = None
         if isinstance(stop_trigger, LimitTrigger):

View File

@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from copy import deepcopy
+

 class IntervalTrigger(object):
     """A Predicate to do something every N cycle."""
@ -23,9 +25,16 @@ class IntervalTrigger(object):
             raise ValueError("period should be a positive integer.")
         self.period = period
         self.unit = unit
+        self.last_index = None

     def __call__(self, trainer):
-        state = trainer.updater.state
-        index = getattr(state, self.unit)
-        fire = index % self.period == 0
+        if self.last_index is None:
+            last_index = getattr(trainer.updater.state, self.unit)
+            self.last_index = last_index
+        last_index = self.last_index
+        index = getattr(trainer.updater.state, self.unit)
+        fire = index // self.period != last_index // self.period
+        self.last_index = index
         return fire
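
The new predicate fires when the index crosses a period boundary between consecutive calls rather than testing divisibility at call time, so a boundary is caught exactly once even if the counter advances by more than one between calls, and seeding last_index on the first call avoids a spurious fire when resuming from a snapshot. A standalone sketch of the difference:

def fires_by_divisibility(index, period):
    return index % period == 0

class FiresOnBoundary:
    def __init__(self, period):
        self.period = period
        self.last_index = None

    def __call__(self, index):
        if self.last_index is None:
            self.last_index = index
        fire = index // self.period != self.last_index // self.period
        self.last_index = index
        return fire

trigger = FiresOnBoundary(period=3)
seen = [1, 2, 4, 5, 7]  # the counter skips the multiples 3 and 6
print([fires_by_divisibility(i, 3) for i in seen])  # all False: boundaries missed
print([trigger(i) for i in seen])  # [False, False, True, False, True]
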

View File

@ -106,8 +106,8 @@ class StandardUpdater(UpdaterBase):
         self.update_core(batch)

         self.state.iteration += 1
-        if self.updaters_per_epoch is not None:
-            if self.state.iteration % self.updaters_per_epoch == 0:
+        if self.updates_per_epoch is not None:
+            if self.state.iteration % self.updates_per_epoch == 0:
                 self.state.epoch += 1

     def update_core(self, batch):
@ -139,7 +139,7 @@ class StandardUpdater(UpdaterBase):
         self.optimizer.update()

     @property
-    def updaters_per_epoch(self):
+    def updates_per_epoch(self):
         """Number of updater per epoch, determined by the length of the
         dataloader."""
         length_of_dataloader = None

tests/test_expansion.py Normal file (29 lines)
View File

@ -0,0 +1,29 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

from parakeet.modules import expansion


def test_expand():
    x = paddle.randn([2, 4, 3])  # (B, T, C)
    lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
    y = expansion.expand(x, lengths)
    assert y.shape == [2, 8, 3]

    print("the first sequence")
    print(y[0])
    print("the second sequence")
    print(y[1])
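
The test prints but does not assert one useful property: the first duration row sums to 6 while the second sums to 8, so the first expanded sequence is zero-padded up to the batch maximum. A complementary check along the same lines (a sketch, not part of the commit):

import paddle
from parakeet.modules import expansion


def test_expand_padding():
    x = paddle.randn([2, 4, 3])
    lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
    y = expansion.expand(x, lengths)
    # rows of M past a sequence's total duration stay zero
    assert bool(paddle.all(y[0, 6:] == 0))
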

tests/test_to_static.py Normal file (34 lines)
View File

@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle.jit import to_static
from paddle.static import InputSpec


def test_applicative_evaluation():
    def m_sqrt2(x):
        return paddle.scale(x, math.sqrt(2))

    subgraph = to_static(m_sqrt2, input_spec=[InputSpec([-1])])
    paddle.jit.save(subgraph, './temp_test_to_static')
    fn = paddle.jit.load('./temp_test_to_static')

    x = paddle.arange(10, dtype=paddle.float32)
    y = fn(x)

    print(x)
    print(y)
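
InputSpec([-1]) declares a variable-length 1-D input, so the saved static graph is not pinned to the length-10 example; presumably this is the same mechanism that lets an exported SpeedySpeechInference accept phone sequences of any length. A shorter sketch without the save/load round trip:

import math

import paddle
from paddle.jit import to_static
from paddle.static import InputSpec


def m_sqrt2(x):
    return paddle.scale(x, math.sqrt(2))

fn = to_static(m_sqrt2, input_spec=[InputSpec([-1])])
print(fn(paddle.arange(5, dtype=paddle.float32)).shape)   # [5]
print(fn(paddle.arange(12, dtype=paddle.float32)).shape)  # [12]
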