Merge pull request #129 from iclementine/speedyspeech

add speedyspeech model and example with baker dataset.
2021-07-19 03:32:51 -05:00 · 2021-07-19 03:32:51 -05:00 · 25788ab2ca
parent 124dedbd7b 4ba8e7e342
commit 25788ab2ca
36 changed files with 2125 additions and 59 deletions
--- a/examples/parallelwave_gan/baker/conf/default.yaml
+++ b/examples/parallelwave_gan/baker/conf/default.yaml
@ -7,14 +7,14 @@
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 sr: 24000                # Sampling rate.
-n_fft: 2048           # FFT size.
-hop_length: 300            # Hop size.
-win_length: 1200         # Window length.
+n_fft: 2048              # FFT size (in samples).
+hop_length: 300          # Hop size (in samples).
+win_length: 1200         # Window length (in samples).
                         # If set to null, it will be the same as fft_size.
 window: "hann"           # Window function.
 n_mels: 80               # Number of mel basis.
-fmin: 80                 # Minimum freq in mel basis calculation.
-fmax: 7600               # Maximum frequency in mel basis calculation.
+fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
 # global_gain_scale: 1.0   # Will be multiplied to all of waveform.
 trim_silence: false      # Whether to trim the start and end of silence.
 top_db: 60               # Need to tune carefully if the recording is not good.
--- a/examples/parallelwave_gan/baker/preprocess.py
+++ b/examples/parallelwave_gan/baker/preprocess.py
@ -202,14 +202,12 @@ def process_sentences(config,
 def main():
    # parse config and args
    parser = argparse.ArgumentParser(
-        description="Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
-    )
+        description="Preprocess audio and then extract features .")
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
-        help="directory including wav files. you need to specify either scp or rootdir."
-    )
+        help="directory to baker dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
--- a/examples/parallelwave_gan/baker/preprocess.sh
+++ b/examples/parallelwave_gan/baker/preprocess.sh
@ -0,0 +1,6 @@
+# Step 1: preprocess the audio and dump features under dump/.
+python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
+# Step 2: compute statistics of the "feats" field listed in the training metadata.
+python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --dumpdir=dump/train
+
+# Step 3: normalize train/dev/test, always with the statistics of the training set.
+python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
+python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
+python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
--- a/examples/parallelwave_gan/baker/run.sh
+++ b/examples/parallelwave_gan/baker/run.sh
@ -0,0 +1,9 @@
+# The FLAGS_* assignments must directly prefix the python command. With a
+# blank line in between, the line continuation ends there and the flags are
+# only set as unexported shell variables that never reach train.py.
+FLAGS_cudnn_exhaustive_search=true \
+FLAGS_conv_workspace_size_limit=4000 \
+python train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=conf/default.yaml \
+    --output-dir=exp/default \
+    --nprocs=1
--- a/examples/speedyspeech/baker/batch_fn.py
+++ b/examples/speedyspeech/baker/batch_fn.py
@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from parakeet.data.batch import batch_sequences
+
+
+def collate_baker_examples(examples):
+    """Collate a list of example dicts into a single batch dict.
+
+    Parameters
+    ----------
+    examples : list of dict
+        Each item provides the keys "phones", "tones", "feats", "durations",
+        "num_phones" and "num_frames".
+
+    Returns
+    -------
+    dict
+        A dict with the same six keys. The four sequence fields are passed
+        through ``batch_sequences``; the two length fields are collected
+        with ``np.array``.
+    """
+    # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
+    # convert every per-example sequence to an ndarray with an explicit dtype
+    phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
+    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
+    feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
+    durations = [
+        np.array(
+            item["durations"], dtype=np.int64) for item in examples
+    ]
+    # per-example lengths, one entry per example
+    num_phones = np.array([item["num_phones"] for item in examples])
+    num_frames = np.array([item["num_frames"] for item in examples])
+
+    # NOTE(review): batch_sequences presumably pads the sequences to a common
+    # length and stacks them -- confirm against parakeet.data.batch.
+    phones = batch_sequences(phones)
+    tones = batch_sequences(tones)
+    feats = batch_sequences(feats)
+    durations = batch_sequences(durations)
+    batch = {
+        "phones": phones,
+        "tones": tones,
+        "num_phones": num_phones,
+        "num_frames": num_frames,
+        "feats": feats,
+        "durations": durations,
+    }
+    return batch
--- a/examples/speedyspeech/baker/compute_statistics.py
+++ b/examples/speedyspeech/baker/compute_statistics.py
@ -0,0 +1,109 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Calculate statistics of feature files."""
+
+import argparse
+import logging
+import os
+from pathlib import Path
+
+import numpy as np
+import yaml
+import json
+import jsonlines
+
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+from parakeet.datasets.data_table import DataTable
+from parakeet.utils.h5_utils import read_hdf5
+from parakeet.utils.h5_utils import write_hdf5
+
+from config import get_cfg_default
+
+
+def main():
+    """Compute the mean and scale of one dumped feature field.
+
+    Reads a jsonlines metadata file, loads the ``--field-name`` entry of every
+    record with ``np.load``, fits a ``StandardScaler`` incrementally over all
+    records and saves ``[mean, scale]`` stacked along axis 0 as a float32
+    ``.npy`` file.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compute mean and variance of dumped raw features.")
+    # --metadata and --field-name are both dereferenced unconditionally below,
+    # so they are declared required to fail early with a clear usage message.
+    parser.add_argument(
+        "--metadata",
+        type=str,
+        required=True,
+        help="json file with id and file paths ")
+    parser.add_argument(
+        "--field-name",
+        type=str,
+        required=True,
+        help="name of the field to compute statistics for.")
+    parser.add_argument(
+        "--config", type=str, help="yaml format configuration file.")
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="path to save statistics. if not provided, "
+        "stats will be saved in the above root directory with name stats.npy")
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)")
+    args = parser.parse_args()
+
+    # set logger
+    if args.verbose > 1:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+    elif args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+        logging.warning('Skip DEBUG/INFO messages')
+
+    config = get_cfg_default()
+    # load config
+    if args.config:
+        config.merge_from_file(args.config)
+
+    # check directory existence
+    if args.output is None:
+        # default: "stats.npy" next to the directory that holds the metadata
+        # file, e.g. dump/train/raw/metadata.jsonl -> dump/train/stats.npy
+        args.output = Path(args.metadata).parent.with_name("stats.npy")
+    else:
+        args.output = Path(args.output)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+
+    with jsonlines.open(args.metadata, 'r') as reader:
+        metadata = list(reader)
+    dataset = DataTable(
+        metadata,
+        fields=[args.field_name],
+        converters={args.field_name: np.load}, )
+    logging.info(f"The number of files = {len(dataset)}.")
+
+    # calculate statistics
+    scaler = StandardScaler()
+    for datum in tqdm(dataset):
+        # StandardScalar supports (*, num_features) by default
+        scaler.partial_fit(datum[args.field_name])
+
+    # row 0 is the mean, row 1 is the scale
+    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
+    np.save(str(args.output), stats.astype(np.float32), allow_pickle=False)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/speedyspeech/baker/conf/default.yaml
+++ b/examples/speedyspeech/baker/conf/default.yaml
@ -0,0 +1,60 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+sr: 24000     # Sampling rate.
+n_fft: 2048           # FFT size (in samples).
+hop_length: 300            # Hop size (in samples).
+win_length: 1200         # Window length (in samples).
+                         # If set to null, it will be the same as n_fft.
+window: "hann"           # Window function.
+n_mels: 80             # Number of mel basis.
+fmin: 80                 # Minimum frequency in mel basis calculation. (Hz)
+fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
+# global_gain_scale: 1.0   # Will be multiplied to all of waveform.
+trim_silence: false      # Whether to trim the start and end of silence.
+top_db: 60 # Need to tune carefully if the recording is not good.
+trim_frame_length: 2048    # Frame size in trimming.(in samples)
+trim_hop_length: 512       # Hop size in trimming.(in samples)
+
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 32
+num_workers: 4
+
+
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:
+  vocab_size: 101 # 99 + 2
+  tone_size: 8 # 6 + 2
+  encoder_hidden_size: 128
+  encoder_kernel_size: 3
+  encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
+  duration_predictor_hidden_size: 128
+  decoder_hidden_size: 128
+  decoder_output_size: 80
+  decoder_kernel_size: 3
+  decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
+
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 300
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 10086
--- a/examples/speedyspeech/baker/config.py
+++ b/examples/speedyspeech/baker/config.py
@ -0,0 +1,25 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+from yacs.config import CfgNode as Configuration
+
+# The defaults are read once, at import time. The path is relative to the
+# current working directory, so scripts importing this module have to be run
+# from the directory that contains ``conf/``.
+with open("conf/default.yaml", 'rt') as f:
+    _C = yaml.safe_load(f)
+    _C = Configuration(_C)
+
+
+def get_cfg_default():
+    """Return a clone of the default configuration loaded from conf/default.yaml."""
+    config = _C.clone()
+    return config
--- a/examples/speedyspeech/baker/frontend.py
+++ b/examples/speedyspeech/baker/frontend.py
@ -0,0 +1,92 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import numpy as np
+import paddle
+import pypinyin
+from pypinyin import lazy_pinyin, Style
+import jieba
+import phkit
+phkit.initialize()
+from parakeet.frontend.vocab import Vocab
+
+# Load the phone and tone inventories (one symbol per line) at import time.
+# Both paths are relative to the current working directory.
+with open("phones.txt", 'rt') as f:
+    phones = [line.strip() for line in f.readlines()]
+
+with open("tones.txt", 'rt') as f:
+    tones = [line.strip() for line in f.readlines()]
+# Vocabularies built without start/end symbols.
+voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
+voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
+
+
+def segment(sentence):
+    """Split a sentence at full-width punctuation and drop empty pieces.
+
+    The separators are the full-width colon, comma, semicolon, period,
+    question mark and exclamation mark; they are not kept in the output.
+    """
+    segments = re.split(r'[：，；。？！]', sentence)
+    # consecutive or trailing punctuation yields empty strings; discard them
+    segments = [seg for seg in segments if len(seg)]
+    return segments
+
+
+def g2p(sentence):
+    """Convert a sentence into parallel lists of phones and tones.
+
+    The sentence is split with ``segment``; each segment is tokenized with
+    jieba and converted to initials and tone-numbered finals with pypinyin.
+    The output starts and ends with 'sil' and has an 'sp' between segments.
+    Initials, 'sil' and 'sp' carry tone '0'; a final carries its last
+    character as tone.
+
+    Returns
+    -------
+    tuple of (list of str, list of str)
+        ``(phones, tones)``, two lists of equal length.
+    """
+    segments = segment(sentence)
+    phones = []
+    phones.append('sil')
+    tones = []
+    tones.append('0')
+
+    for seg in segments:
+        seg = jieba.lcut(seg)
+        initials = lazy_pinyin(
+            seg, neutral_tone_with_five=True, style=Style.INITIALS)
+        finals = lazy_pinyin(
+            seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
+        for c, v in zip(initials, finals):
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            # (a bare "i" + tone digit becomes "ii" after z/c/s and "iii"
+            # after zh/ch/sh/r)
+            if re.match(r'i\d', v):
+                if c in ['z', 'c', 's']:
+                    v = re.sub('i', 'ii', v)
+                elif c in ['zh', 'ch', 'sh', 'r']:
+                    v = re.sub('i', 'iii', v)
+            if c:
+                phones.append(c)
+                tones.append('0')
+            if v:
+                # the last character of the final is taken as its tone
+                phones.append(v[:-1])
+                tones.append(v[-1])
+        # short pause after every segment
+        phones.append('sp')
+        tones.append('0')
+    # the last symbol (the trailing 'sp', if any segment exists) becomes 'sil'
+    phones[-1] = 'sil'
+    tones[-1] = '0'
+    return (phones, tones)
+
+
+def p2id(voc, phonemes):
+    """Look up every phoneme in ``voc`` and return the ids as an int64 array."""
+    phone_ids = [voc.lookup(item) for item in phonemes]
+    return np.array(phone_ids, np.int64)
+
+
+def t2id(voc, tones):
+    """Look up every tone in ``voc`` and return the ids as an int64 array."""
+    tone_ids = [voc.lookup(item) for item in tones]
+    return np.array(tone_ids, np.int64)
+
+
+def text_analysis(sentence):
+    """Convert a sentence into phone-id and tone-id tensors.
+
+    Runs ``g2p`` on the sentence, prints the sentence and its phone sequence
+    (with the tone appended to every phone whose tone is not '0'), maps the
+    symbols to ids with the module-level vocabularies and returns the two id
+    sequences as paddle tensors ``(phones, tones)``.
+    """
+    phonemes, tones = g2p(sentence)
+    print(sentence)
+    print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])
+    phone_ids = p2id(voc_phones, phonemes)
+    tone_ids = t2id(voc_tones, tones)
+    phones = paddle.to_tensor(phone_ids)
+    tones = paddle.to_tensor(tone_ids)
+    return phones, tones
--- a/examples/speedyspeech/baker/normalize.py
+++ b/examples/speedyspeech/baker/normalize.py
@ -0,0 +1,150 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalize feature files and dump them."""
+
+import argparse
+import logging
+import os
+from copy import copy
+from operator import itemgetter
+from pathlib import Path
+
+import numpy as np
+import yaml
+import jsonlines
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+from parakeet.frontend.vocab import Vocab
+from parakeet.datasets.data_table import DataTable
+
+from config import get_cfg_default
+
+
+def main():
+    """Normalize dumped features and convert phones/tones to ids.
+
+    For every record of the input metadata the "feats" array is standardized
+    with the mean/scale stored in ``--stats`` and saved as float32 to
+    ``<dumpdir>/<utt_id>-feats.npy``. The phone and tone symbols are mapped
+    to ids with the given vocabulary files, and a new ``metadata.jsonl``
+    (sorted by utt_id) is written into ``--dumpdir``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
+    )
+    parser.add_argument(
+        "--metadata",
+        type=str,
+        required=True,
+        help="jsonlines metadata file of the raw features to be normalized.")
+    parser.add_argument(
+        "--dumpdir",
+        type=str,
+        required=True,
+        help="directory to dump normalized feature files.")
+    parser.add_argument(
+        "--stats", type=str, required=True, help="statistics file.")
+    parser.add_argument(
+        "--phones",
+        type=str,
+        default="phones.txt",
+        help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones", type=str, default="tones.txt", help="tone vocabulary file.")
+    parser.add_argument(
+        "--config", type=str, help="yaml format configuration file.")
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)")
+    args = parser.parse_args()
+
+    # set logger
+    if args.verbose > 1:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+    elif args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+        )
+        logging.warning('Skip DEBUG/INFO messages')
+
+    # load config
+    config = get_cfg_default()
+    if args.config:
+        config.merge_from_file(args.config)
+
+    # check directory existence
+    dumpdir = Path(args.dumpdir).resolve()
+    dumpdir.mkdir(parents=True, exist_ok=True)
+
+    # get dataset
+    with jsonlines.open(args.metadata, 'r') as reader:
+        metadata = list(reader)
+    dataset = DataTable(metadata, converters={'feats': np.load, })
+    logging.info(f"The number of files = {len(dataset)}.")
+
+    # restore scaler; read the statistics file once (row 0: mean, row 1: scale)
+    stats = np.load(args.stats)
+    scaler = StandardScaler()
+    scaler.mean_ = stats[0]
+    scaler.scale_ = stats[1]
+
+    # from version 0.23.0, this information is needed
+    scaler.n_features_in_ = scaler.mean_.shape[0]
+
+    with open(args.phones, 'rt') as f:
+        phones = [line.strip() for line in f.readlines()]
+
+    with open(args.tones, 'rt') as f:
+        tones = [line.strip() for line in f.readlines()]
+    voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
+    voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)
+
+    # process each file
+    output_metadata = []
+
+    for item in tqdm(dataset):
+        utt_id = item['utt_id']
+        mel = item['feats']
+        # normalize
+        mel = scaler.transform(mel)
+
+        # save
+        mel_path = dumpdir / f"{utt_id}-feats.npy"
+        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
+        # symbols -> integer ids
+        phone_ids = [voc_phones.lookup(p) for p in item['phones']]
+        tone_ids = [voc_tones.lookup(t) for t in item['tones']]
+        output_metadata.append({
+            'utt_id': utt_id,
+            'phones': phone_ids,
+            'tones': tone_ids,
+            'num_phones': item['num_phones'],
+            'num_frames': item['num_frames'],
+            'durations': item['durations'],
+            'feats': str(mel_path),
+        })
+    output_metadata.sort(key=itemgetter('utt_id'))
+    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
+    with jsonlines.open(output_metadata_path, 'w') as writer:
+        for item in output_metadata:
+            writer.write(item)
+    logging.info(f"metadata dumped into {output_metadata_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/speedyspeech/baker/phones.txt
+++ b/examples/speedyspeech/baker/phones.txt
@ -0,0 +1,99 @@
+b
+p
+m
+f
+d
+t
+n
+l
+g
+k
+h
+zh
+ch
+sh
+r
+z
+c
+s
+j
+q
+x
+a
+ar
+ai
+air
+ao
+aor
+an
+anr
+ang
+angr
+e
+er
+ei
+eir
+en
+enr
+eng
+engr
+o
+or
+ou
+our
+ong
+ongr
+ii
+iir
+iii
+iiir
+i
+ir
+ia
+iar
+iao
+iaor
+ian
+ianr
+iang
+iangr
+ie
+ier
+io
+ior
+iou
+iour
+iong
+iongr
+in
+inr
+ing
+ingr
+u
+ur
+ua
+uar
+uai
+uair
+uan
+uanr
+uang
+uangr
+uei
+ueir
+uo
+uor
+uen
+uenr
+ueng
+uengr
+v
+vr
+ve
+ver
+van
+vanr
+vn
+vnr
+sil
+sp
--- a/examples/speedyspeech/baker/preprocess.py
+++ b/examples/speedyspeech/baker/preprocess.py
@ -0,0 +1,309 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Dict, Any
+import soundfile as sf
+import librosa
+import numpy as np
+import argparse
+import yaml
+import json
+import re
+import jsonlines
+import concurrent.futures
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from pathlib import Path
+import tqdm
+from operator import itemgetter
+from praatio import tgio
+import logging
+
+from config import get_cfg_default
+from tg_utils import validate_textgrid
+
+
+def logmelfilterbank(audio,
+                     sr,
+                     n_fft=1024,
+                     hop_length=256,
+                     win_length=None,
+                     window="hann",
+                     n_mels=80,
+                     fmin=None,
+                     fmax=None,
+                     eps=1e-10):
+    """Compute log-Mel filterbank feature.
+
+    Parameters
+    ----------
+    audio : ndarray
+        Audio signal (T,).
+    sr : int
+        Sampling rate.
+    n_fft : int
+        FFT size. (Default value = 1024)
+    hop_length : int
+        Hop size. (Default value = 256)
+    win_length : int
+        Window length. If set to None, it will be the same as n_fft. (Default value = None)
+    window : str
+        Window function type. (Default value = "hann")
+    n_mels : int
+        Number of mel basis. (Default value = 80)
+    fmin : int
+        Minimum frequency in mel basis calculation. None means 0. (Default value = None)
+    fmax : int
+        Maximum frequency in mel basis calculation. None means sr / 2. (Default value = None)
+    eps : float
+        Epsilon value to avoid inf in log calculation. (Default value = 1e-10)
+
+    Returns
+    -------
+    np.ndarray
+        Log10 Mel filterbank feature (n_mels, #frames). No transpose is
+        applied here; callers that need (#frames, n_mels) must transpose.
+
+    """
+    # get amplitude spectrogram
+    x_stft = librosa.stft(
+        audio,
+        n_fft=n_fft,
+        hop_length=hop_length,
+        win_length=win_length,
+        window=window,
+        pad_mode="reflect")
+    spc = np.abs(x_stft)  # (#bins, #frames,)
+
+    # get mel basis
+    fmin = 0 if fmin is None else fmin
+    fmax = sr / 2 if fmax is None else fmax
+    # keyword arguments work on both old and new librosa releases, whereas
+    # the positional form is rejected once these parameters are keyword-only
+    mel_basis = librosa.filters.mel(
+        sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+
+    # clip at eps before the log so silent bins do not produce -inf
+    return np.log10(np.maximum(eps, np.dot(mel_basis, spc)))
+
+
+def process_sentence(config: Dict[str, Any],
+                     fp: Path,
+                     alignment_fp: Path,
+                     output_dir: Path):
+    """Extract the log-mel spectrogram and phone durations of one utterance.
+
+    Parameters
+    ----------
+    config : Dict[str, Any]
+        Configuration; its fields are read as attributes (``config.sr``,
+        ``config.n_fft``, ...).
+    fp : Path
+        Path of the audio file. Its stem is used as the utterance id.
+    alignment_fp : Path
+        Path of the TextGrid alignment of the utterance.
+    output_dir : Path
+        Directory where ``<utt_id>_feats.npy`` is saved.
+
+    Returns
+    -------
+    dict
+        A record with keys "utt_id", "phones", "tones", "num_phones",
+        "num_frames", "durations" (in frames) and "feats" (absolute path of
+        the saved spectrogram).
+    """
+    utt_id = fp.stem
+
+    # reading
+    y, sr = librosa.load(fp, sr=config.sr)  # resampling may occur
+    assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
+    assert np.abs(y).max(
+    ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
+    # pass y by keyword: newer librosa releases no longer accept it positionally
+    duration = librosa.get_duration(y=y, sr=sr)
+
+    # intervals with empty labels are ignored
+    alignment = tgio.openTextgrid(alignment_fp)
+
+    # validate text grid against audio file
+    num_samples = y.shape[0]
+    validate_textgrid(alignment, num_samples, sr)
+
+    # only with baker's annotation
+    intervals = alignment.tierDict[alignment.tierNameList[0]].entryList
+
+    # both the first and the last interval are expected to be silence
+    first, last = intervals[0], intervals[-1]
+    if not (first.label == "sil" and first.end < duration):
+        logging.warning(
+            f" There is something wrong with the fisrt interval {first} in utterance: {utt_id}"
+        )
+    if not (last.label == "sil" and last.start < duration):
+        logging.warning(
+            f" There is something wrong with the last interval {last} in utterance: {utt_id}"
+        )
+
+    logmel = logmelfilterbank(
+        y,
+        sr=sr,
+        n_fft=config.n_fft,
+        window=config.window,
+        win_length=config.win_length,
+        hop_length=config.hop_length,
+        n_mels=config.n_mels,
+        fmin=config.fmin,
+        fmax=config.fmax)
+
+    # extract phone and duration
+    phones = []
+    tones = []
+    ends = []
+
+    for interval in intervals:
+        label = interval.label
+        label = label.replace("sp1", "sp")  # Baker has sp1 rather than sp
+
+        # split tone from finals
+        match = re.match(r'^(\w+)([012345])$', label)
+        if match:
+            phones.append(match.group(1))
+            tones.append(match.group(2))
+        else:
+            # no trailing tone digit: keep the label and use tone '0'
+            phones.append(label)
+            tones.append('0')
+        # an interval may not end after the audio does
+        end = min(duration, interval.end)
+        ends.append(end)
+
+    # end time of every phone -> frame index; differences are the durations
+    frame_pos = librosa.time_to_frames(
+        ends, sr=sr, hop_length=config.hop_length)
+    durations_frame = np.diff(frame_pos, prepend=0)
+
+    num_frames = logmel.shape[-1]  # number of frames of the spectrogram
+    extra = np.sum(durations_frame) - num_frames
+    assert extra <= 0, (
+        f"Number of frames inferred from alignemnt is "
+        f"larger than number of frames of the spectrogram by {extra} frames")
+    # give the missing frames to the last phone so the durations sum up to
+    # the number of spectrogram frames
+    durations_frame[-1] += (-extra)
+
+    assert np.sum(durations_frame) == num_frames
+    durations_frame = durations_frame.tolist()
+
+    mel_path = output_dir / (utt_id + "_feats.npy")
+    np.save(mel_path, logmel.T)  # (num_frames, n_mels)
+    record = {
+        "utt_id": utt_id,
+        "phones": phones,
+        "tones": tones,
+        "num_phones": len(phones),
+        "num_frames": num_frames,
+        "durations": durations_frame,
+        "feats": str(mel_path.resolve()),  # use absolute path
+    }
+    return record
+
+
+def process_sentences(config,
+                      fps: List[Path],
+                      alignment_fps: List[Path],
+                      output_dir: Path,
+                      nprocs: int=1):
+    """Process paired audio/alignment files and write ``metadata.jsonl``.
+
+    ``fps`` and ``alignment_fps`` are paired by position. With ``nprocs == 1``
+    the files are processed sequentially; otherwise they are submitted to a
+    thread pool with ``nprocs`` workers (threads, despite the name). The
+    records are sorted by "utt_id" and written, one per line, to
+    ``output_dir / "metadata.jsonl"``.
+    """
+    if nprocs == 1:
+        results = []
+        for fp, alignment_fp in tqdm.tqdm(
+                zip(fps, alignment_fps), total=len(fps)):
+            results.append(
+                process_sentence(config, fp, alignment_fp, output_dir))
+    else:
+        with ThreadPoolExecutor(nprocs) as pool:
+            futures = []
+            with tqdm.tqdm(total=len(fps)) as progress:
+                for fp, alignment_fp in zip(fps, alignment_fps):
+                    future = pool.submit(process_sentence, config, fp,
+                                         alignment_fp, output_dir)
+                    # advance the progress bar whenever a task finishes
+                    future.add_done_callback(lambda p: progress.update())
+                    futures.append(future)
+
+                # result() blocks until the task is done and re-raises any
+                # exception raised inside the worker
+                results = []
+                for ft in futures:
+                    results.append(ft.result())
+
+    results.sort(key=itemgetter("utt_id"))
+    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+        for item in results:
+            writer.write(item)
+    print("Done")
+
+
+def main():
+    """Preprocess the Baker dataset into train/dev/test feature dumps.
+
+    Collects the wav files under ``<rootdir>/Wave`` and the alignment files
+    under ``<rootdir>/PhoneLabeling``, drops the utterances with known
+    annotation errors, splits the remainder into train (first 9800), dev
+    (next 100) and test (the rest), and runs ``process_sentences`` on each
+    split, dumping into ``<dumpdir>/{train,dev,test}/raw``.
+
+    Raises
+    ------
+    ValueError
+        If the wav files and the alignment files do not match one-to-one.
+    """
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+    # rootdir is always dereferenced below, so a missing value is reported
+    # by argparse instead of failing later in Path(None)
+    parser.add_argument(
+        "--rootdir",
+        type=str,
+        required=True,
+        help="directory to baker dataset.")
+    parser.add_argument(
+        "--dumpdir",
+        type=str,
+        required=True,
+        help="directory to dump feature files.")
+    parser.add_argument(
+        "--config", type=str, help="yaml format configuration file.")
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)")
+    parser.add_argument(
+        "--num_cpu", type=int, default=1, help="number of process.")
+    args = parser.parse_args()
+
+    C = get_cfg_default()
+    if args.config:
+        C.merge_from_file(args.config)
+    # freeze in both cases so the default configuration is read-only as well
+    C.freeze()
+
+    if args.verbose > 1:
+        print(vars(args))
+        print(C)
+
+    root_dir = Path(args.rootdir).expanduser()
+    dumpdir = Path(args.dumpdir).expanduser()
+    dumpdir.mkdir(parents=True, exist_ok=True)
+
+    wav_files = sorted(list((root_dir / "Wave").rglob("*.wav")))
+    alignment_files = sorted(
+        list((root_dir / "PhoneLabeling").rglob("*.interval")))
+
+    # filter out several files that have errors in annotation
+    exclude = {'000611', '000662', '002365', '005107'}
+    wav_files = [f for f in wav_files if f.stem not in exclude]
+    alignment_files = [f for f in alignment_files if f.stem not in exclude]
+
+    # the two lists are paired by position below; a single missing file
+    # would silently shift every following pair, so verify the pairing
+    wav_stems = [f.stem for f in wav_files]
+    alignment_stems = [f.stem for f in alignment_files]
+    if wav_stems != alignment_stems:
+        raise ValueError(
+            "Wave files and PhoneLabeling files do not match one-to-one; "
+            "cannot pair audio with alignments.")
+
+    # split data into 3 sections
+    num_train = 9800
+    num_dev = 100
+
+    train_wav_files = wav_files[:num_train]
+    dev_wav_files = wav_files[num_train:num_train + num_dev]
+    test_wav_files = wav_files[num_train + num_dev:]
+
+    train_alignment_files = alignment_files[:num_train]
+    dev_alignment_files = alignment_files[num_train:num_train + num_dev]
+    test_alignment_files = alignment_files[num_train + num_dev:]
+
+    train_dump_dir = dumpdir / "train" / "raw"
+    train_dump_dir.mkdir(parents=True, exist_ok=True)
+    dev_dump_dir = dumpdir / "dev" / "raw"
+    dev_dump_dir.mkdir(parents=True, exist_ok=True)
+    test_dump_dir = dumpdir / "test" / "raw"
+    test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+    # process for the 3 sections
+    process_sentences(
+        C,
+        train_wav_files,
+        train_alignment_files,
+        train_dump_dir,
+        nprocs=args.num_cpu)
+    process_sentences(
+        C,
+        dev_wav_files,
+        dev_alignment_files,
+        dev_dump_dir,
+        nprocs=args.num_cpu)
+    process_sentences(
+        C,
+        test_wav_files,
+        test_alignment_files,
+        test_dump_dir,
+        nprocs=args.num_cpu)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/speedyspeech/baker/preprocess.sh
+++ b/examples/speedyspeech/baker/preprocess.sh
@ -0,0 +1,6 @@
+# Step 1: extract raw features and durations into dump/{train,dev,test}/raw.
+python preprocess.py --rootdir=~/datasets/BZNSYP/ --dumpdir=dump --num_cpu=20
+# Step 2: compute mean and scale of the "feats" field over the training split.
+python compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" --output=dump/train/stats.npy
+
+# Step 3: normalize train/dev/test, always with the statistics of the training set.
+python normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/stats.npy
+python normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/stats.npy
+python normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/stats.npy
--- a/examples/speedyspeech/baker/run.sh
+++ b/examples/speedyspeech/baker/run.sh
@ -0,0 +1,6 @@
+python train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=conf/default.yaml \
+    --output-dir=exp/default \
+    --nprocs=1
--- a/examples/speedyspeech/baker/sentences.txt
+++ b/examples/speedyspeech/baker/sentences.txt
@ -0,0 +1,16 @@
+001 凯莫瑞安联合体的经济崩溃，迫在眉睫。
+002 对于所有想要离开那片废土，去寻找更美好生活的人来说。
+003 克哈，是你们所有人安全的港湾。
+004 为了保护尤摩扬人民不受异虫的残害，我所做的，比他们自己的领导委员会都多。
+005 无论他们如何诽谤我，我将继续为所有泰伦人的最大利益，而努力奋斗。
+006 身为你们的元首，我带领泰伦人实现了人类统治领地和经济的扩张。
+007 我们将继续成长，用行动回击那些只会说风凉话，不愿意和我们相向而行的害群之马。
+008 帝国武装力量，无数的优秀儿女，正时刻守卫着我们的家园大门，但是他们孤木难支。
+009 凡是今天应征入伍者，所获的所有刑罚罪责，减半。
+010 激进分子和异见者希望你们一听见枪声，就背弃多年的和平与繁荣。
+011 他们没有勇气和能力，带领人类穿越一个充满危险的星系。
+012 法治是我们的命脉，然而它却受到前所未有的挑战。
+013 我将恢复我们帝国的荣光，绝不会向任何外星势力低头。
+014 我已经驯服了异虫，荡平了星灵。如今它们的创造者，想要夺走我们拥有的一切。
+015 永远记住，谁才是最能保护你们的人。
+016 不要听信别人的谗言，我不是什么克隆人。
--- a/examples/speedyspeech/baker/speedyspeech_updater.py
+++ b/examples/speedyspeech/baker/speedyspeech_updater.py
@ -0,0 +1,110 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.nn import functional as F
+from paddle.fluid.layers import huber_loss
+
+from parakeet.modules.ssim import ssim
+from parakeet.modules.losses import masked_l1_loss, weighted_mean
+from parakeet.training.reporter import report
+from parakeet.training.updaters.standard_updater import StandardUpdater
+from parakeet.training.extensions.evaluator import StandardEvaluator
+from parakeet.models.speedyspeech import SpeedySpeech
+
+
+class SpeedySpeechUpdater(StandardUpdater):
+    def update_core(self, batch):
+        decoded, predicted_durations = self.model(
+            text=batch["phones"],
+            tones=batch["tones"],
+            plens=batch["num_phones"],
+            durations=batch["durations"])
+
+        target_mel = batch["feats"]
+        spec_mask = F.sequence_mask(
+            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
+        text_mask = F.sequence_mask(
+            batch["num_phones"], dtype=predicted_durations.dtype)
+
+        # spec loss
+        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)
+
+        # duration loss
+        target_durations = batch["durations"]
+        target_durations = paddle.maximum(
+            target_durations.astype(predicted_durations.dtype),
+            paddle.to_tensor([1.0]))
+        duration_loss = weighted_mean(
+            huber_loss(
+                predicted_durations, paddle.log(target_durations), delta=1.0),
+            text_mask, )
+
+        # ssim loss
+        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
+                               (target_mel * spec_mask).unsqueeze(1))
+
+        loss = l1_loss + ssim_loss + duration_loss
+
+        optimizer = self.optimizer
+        optimizer.clear_grad()
+        loss.backward()
+        optimizer.step()
+
+        report("train/loss", float(loss))
+        report("train/l1_loss", float(l1_loss))
+        report("train/duration_loss", float(duration_loss))
+        report("train/ssim_loss", float(ssim_loss))
+
+
+class SpeedySpeechEvaluator(StandardEvaluator):
+    def evaluate_core(self, batch):
+        print("fire")
+        decoded, predicted_durations = self.model(
+            text=batch["phones"],
+            tones=batch["tones"],
+            plens=batch["num_phones"],
+            durations=batch["durations"])
+
+        target_mel = batch["feats"]
+        spec_mask = F.sequence_mask(
+            batch["num_frames"], dtype=target_mel.dtype).unsqueeze(-1)
+        text_mask = F.sequence_mask(
+            batch["num_phones"], dtype=predicted_durations.dtype)
+
+        # spec loss
+        l1_loss = masked_l1_loss(decoded, target_mel, spec_mask)
+
+        # duration loss
+        target_durations = batch["durations"]
+        target_durations = paddle.maximum(
+            target_durations.astype(predicted_durations.dtype),
+            paddle.to_tensor([1.0]))
+        duration_loss = weighted_mean(
+            huber_loss(
+                predicted_durations, paddle.log(target_durations), delta=1.0),
+            text_mask, )
+
+        # ssim loss
+        ssim_loss = 1.0 - ssim((decoded * spec_mask).unsqueeze(1),
+                               (target_mel * spec_mask).unsqueeze(1))
+
+        loss = l1_loss + ssim_loss + duration_loss
+
+        # import pdb; pdb.set_trace()
+
+        report("eval/loss", float(loss))
+        report("eval/l1_loss", float(l1_loss))
+        report("eval/duration_loss", float(duration_loss))
+        report("eval/ssim_loss", float(ssim_loss))
--- a/examples/speedyspeech/baker/synthesize.py
+++ b/examples/speedyspeech/baker/synthesize.py
@ -0,0 +1,146 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import logging
+import argparse
+import dataclasses
+from pathlib import Path
+
+import yaml
+import jsonlines
+import paddle
+import numpy as np
+import soundfile as sf
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle import distributed as dist
+from yacs.config import CfgNode
+
+from parakeet.datasets.data_table import DataTable
+from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
+from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
+from parakeet.modules.normalizer import ZScore
+
+
+def evaluate(args, speedyspeech_config, pwg_config):
+    # dataloader has been too verbose
+    logging.getLogger("DataLoader").disabled = True
+
+    # construct dataset for evaluation
+    with jsonlines.open(args.test_metadata, 'r') as reader:
+        test_metadata = list(reader)
+    test_dataset = DataTable(
+        data=test_metadata, fields=["utt_id", "phones", "tones"])
+
+    model = SpeedySpeech(**speedyspeech_config["model"])
+    model.set_state_dict(
+        paddle.load(args.speedyspeech_checkpoint)["main_params"])
+    model.eval()
+
+    vocoder = PWGGenerator(**pwg_config["generator_params"])
+    vocoder.set_state_dict(paddle.load(args.pwg_params))
+    vocoder.remove_weight_norm()
+    vocoder.eval()
+    print("model done!")
+
+    stat = np.load(args.speedyspeech_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    speedyspeech_normalizer = ZScore(mu, std)
+
+    stat = np.load(args.pwg_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    pwg_normalizer = ZScore(mu, std)
+
+    speedyspeech_inferencce = SpeedySpeechInference(speedyspeech_normalizer,
+                                                    model)
+    pwg_inference = PWGInference(pwg_normalizer, vocoder)
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for datum in test_dataset:
+        utt_id = datum["utt_id"]
+        phones = paddle.to_tensor(datum["phones"])
+        tones = paddle.to_tensor(datum["tones"])
+
+        with paddle.no_grad():
+            wav = pwg_inference(speedyspeech_inferencce(phones, tones))
+        sf.write(
+            output_dir / (utt_id + ".wav"),
+            wav.numpy(),
+            samplerate=speedyspeech_config.sr)
+        print(f"{utt_id} done!")
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(
+        description="Synthesize with speedyspeech & parallel wavegan.")
+    parser.add_argument(
+        "--speedyspeech-config",
+        type=str,
+        help="config file to overwrite default config")
+    parser.add_argument(
+        "--speedyspeech-checkpoint",
+        type=str,
+        help="speedyspeech checkpoint to load.")
+    parser.add_argument(
+        "--speedyspeech-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument(
+        "--pwg-config",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument(
+        "--pwg-params",
+        type=str,
+        help="parallel wavegan generator parameters to load.")
+    parser.add_argument(
+        "--pwg-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument("--test-metadata", type=str, help="test metadata")
+    parser.add_argument("--output-dir", type=str, help="output dir")
+    parser.add_argument(
+        "--device", type=str, default="gpu", help="device type to use")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose")
+
+    args = parser.parse_args()
+    with open(args.speedyspeech_config) as f:
+        speedyspeech_config = CfgNode(yaml.safe_load(f))
+    with open(args.pwg_config) as f:
+        pwg_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(speedyspeech_config)
+    print(pwg_config)
+
+    evaluate(args, speedyspeech_config, pwg_config)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/speedyspeech/baker/synthesize.sh
+++ b/examples/speedyspeech/baker/synthesize.sh
@ -0,0 +1,10 @@
+python synthesize.py \
+  --speedyspeech-config=conf/default.yaml \
+  --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
+  --speedyspeech-stat=dump/train/stats.npy \
+  --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
+  --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
+  --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
+  --test-metadata=dump/test/norm/metadata.jsonl \
+  --output-dir=exp/debug/test \
+  --device="gpu"
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@ -0,0 +1,150 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import logging
+import argparse
+import dataclasses
+from pathlib import Path
+
+import yaml
+import jsonlines
+import paddle
+import numpy as np
+import soundfile as sf
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle import distributed as dist
+from yacs.config import CfgNode
+
+from parakeet.datasets.data_table import DataTable
+from parakeet.models.speedyspeech import SpeedySpeech, SpeedySpeechInference
+from parakeet.models.parallel_wavegan import PWGGenerator, PWGInference
+from parakeet.modules.normalizer import ZScore
+
+from frontend import text_analysis
+
+
+def evaluate(args, speedyspeech_config, pwg_config):
+    # dataloader has been too verbose
+    logging.getLogger("DataLoader").disabled = True
+
+    # construct dataset for evaluation
+    sentences = []
+    with open(args.text, 'rt') as f:
+        for line in f:
+            utt_id, sentence = line.strip().split()
+            sentences.append((utt_id, sentence))
+
+    model = SpeedySpeech(**speedyspeech_config["model"])
+    model.set_state_dict(
+        paddle.load(args.speedyspeech_checkpoint)["main_params"])
+    model.eval()
+
+    vocoder = PWGGenerator(**pwg_config["generator_params"])
+    vocoder.set_state_dict(paddle.load(args.pwg_params))
+    vocoder.remove_weight_norm()
+    vocoder.eval()
+    print("model done!")
+
+    stat = np.load(args.speedyspeech_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    speedyspeech_normalizer = ZScore(mu, std)
+
+    stat = np.load(args.pwg_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    pwg_normalizer = ZScore(mu, std)
+
+    speedyspeech_inferencce = SpeedySpeechInference(speedyspeech_normalizer,
+                                                    model)
+    pwg_inference = PWGInference(pwg_normalizer, vocoder)
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for utt_id, sentence in sentences:
+        phones, tones = text_analysis(sentence)
+
+        with paddle.no_grad():
+            wav = pwg_inference(speedyspeech_inferencce(phones, tones))
+        sf.write(
+            output_dir / (utt_id + ".wav"),
+            wav.numpy(),
+            samplerate=speedyspeech_config.sr)
+        print(f"{utt_id} done!")
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(
+        description="Synthesize with speedyspeech & parallel wavegan.")
+    parser.add_argument(
+        "--speedyspeech-config",
+        type=str,
+        help="config file to overwrite default config")
+    parser.add_argument(
+        "--speedyspeech-checkpoint",
+        type=str,
+        help="speedyspeech checkpoint to load.")
+    parser.add_argument(
+        "--speedyspeech-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument(
+        "--pwg-config",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument(
+        "--pwg-params",
+        type=str,
+        help="parallel wavegan generator parameters to load.")
+    parser.add_argument(
+        "--pwg-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument("--output-dir", type=str, help="output dir")
+    parser.add_argument(
+        "--device", type=str, default="gpu", help="device type to use")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose")
+
+    args = parser.parse_args()
+    with open(args.speedyspeech_config) as f:
+        speedyspeech_config = CfgNode(yaml.safe_load(f))
+    with open(args.pwg_config) as f:
+        pwg_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(speedyspeech_config)
+    print(pwg_config)
+
+    evaluate(args, speedyspeech_config, pwg_config)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/speedyspeech/baker/synthesize_e2e.sh
+++ b/examples/speedyspeech/baker/synthesize_e2e.sh
@ -0,0 +1,10 @@
+python synthesize_e2e.py \
+  --speedyspeech-config=conf/default.yaml \
+  --speedyspeech-checkpoint=exp/debug/checkpoints/snapshot_iter_91800.pdz \
+  --speedyspeech-stat=dump/train/stats.npy \
+  --pwg-config=../../parallelwave_gan/baker/conf/default.yaml \
+  --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
+  --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
+  --text=sentences.txt \
+  --output-dir=exp/e2e \
+  --device="gpu"
--- a/examples/speedyspeech/baker/tg_utils.py
+++ b/examples/speedyspeech/baker/tg_utils.py
@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import librosa
+from praatio import tgio
+
+
+def validate_textgrid(text_grid, num_samples, sr):
+    """Validate Text Grid to make sure that the time interval annotated 
+    by the tex grid file does not go beyond the audio file.
+    """
+    start = text_grid.minTimestamp
+    end = text_grid.maxTimestamp
+
+    end_audio = librosa.samples_to_time(num_samples, sr)
+    return start == 0.0 and end <= end_audio
--- a/examples/speedyspeech/baker/tones.txt
+++ b/examples/speedyspeech/baker/tones.txt
@ -0,0 +1,6 @@
+0
+1
+2
+3
+4
+5
--- a/examples/speedyspeech/baker/train.py
+++ b/examples/speedyspeech/baker/train.py
@ -0,0 +1,186 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import logging
+import argparse
+import dataclasses
+from pathlib import Path
+
+import yaml
+import jsonlines
+import paddle
+import numpy as np
+from paddle import nn
+from paddle.nn import functional as F
+from paddle import distributed as dist
+from paddle.io import DataLoader, DistributedBatchSampler
+from paddle.optimizer import Adam  # No RAdaom
+from paddle.optimizer.lr import StepDecay
+from paddle import DataParallel
+from visualdl import LogWriter
+
+from parakeet.datasets.data_table import DataTable
+from parakeet.models.speedyspeech import SpeedySpeech
+
+from parakeet.training.updater import UpdaterBase
+from parakeet.training.trainer import Trainer
+from parakeet.training.reporter import report
+from parakeet.training import extension
+from parakeet.training.extensions.snapshot import Snapshot
+from parakeet.training.extensions.visualizer import VisualDL
+from parakeet.training.seeding import seed_everything
+
+from batch_fn import collate_baker_examples
+from speedyspeech_updater import SpeedySpeechUpdater, SpeedySpeechEvaluator
+from config import get_cfg_default
+
+
+def train_sp(args, config):
+    # decides device type and whether to run in parallel
+    # setup running environment correctly
+    if not paddle.is_compiled_with_cuda:
+        paddle.set_device("cpu")
+    else:
+        paddle.set_device("gpu")
+        world_size = paddle.distributed.get_world_size()
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    print(
+        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+    )
+
+    # dataloader has been too verbose
+    logging.getLogger("DataLoader").disabled = True
+
+    # construct dataset for training and validation
+    with jsonlines.open(args.train_metadata, 'r') as reader:
+        train_metadata = list(reader)
+    train_dataset = DataTable(
+        data=train_metadata,
+        fields=[
+            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
+        ],
+        converters={"feats": np.load, }, )
+    with jsonlines.open(args.dev_metadata, 'r') as reader:
+        dev_metadata = list(reader)
+    dev_dataset = DataTable(
+        data=dev_metadata,
+        fields=[
+            "phones", "tones", "num_phones", "num_frames", "feats", "durations"
+        ],
+        converters={"feats": np.load, }, )
+
+    # collate function and dataloader
+    train_sampler = DistributedBatchSampler(
+        train_dataset,
+        batch_size=config.batch_size,
+        shuffle=False,
+        drop_last=True)
+    # dev_sampler = DistributedBatchSampler(dev_dataset,
+    #                                       batch_size=config.batch_size,
+    #                                       shuffle=False,
+    #                                       drop_last=False)
+    print("samplers done!")
+
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_sampler=train_sampler,
+        collate_fn=collate_baker_examples,
+        num_workers=config.num_workers)
+    dev_dataloader = DataLoader(
+        dev_dataset,
+        shuffle=False,
+        drop_last=False,
+        batch_size=config.batch_size,
+        collate_fn=collate_baker_examples,
+        num_workers=config.num_workers)
+    print("dataloaders done!")
+
+    # batch = collate_baker_examples([train_dataset[i] for i in range(10)])
+    # # batch = collate_baker_examples([dev_dataset[i] for i in range(10)])
+    # import pdb; pdb.set_trace()
+    model = SpeedySpeech(**config["model"])
+    if world_size > 1:
+        model = DataParallel(model)  # TODO, do not use vocab size from config
+    # print(model)
+    print("model done!")
+    optimizer = Adam(
+        0.001,
+        parameters=model.parameters(),
+        grad_clip=nn.ClipGradByGlobalNorm(5.0))
+    print("optimizer done!")
+
+    updater = SpeedySpeechUpdater(
+        model=model, optimizer=optimizer, dataloader=train_dataloader)
+
+    output_dir = Path(args.output_dir)
+    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+
+    evaluator = SpeedySpeechEvaluator(model, dev_dataloader)
+
+    if dist.get_rank() == 0:
+        trainer.extend(evaluator, trigger=(1, "epoch"))
+        writer = LogWriter(str(output_dir))
+        trainer.extend(VisualDL(writer), trigger=(1, "iteration"))
+        trainer.extend(
+            Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+    print(trainer.extensions)
+    trainer.run()
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(description="Train a ParallelWaveGAN "
+                                     "model with Baker Mandrin TTS dataset.")
+    parser.add_argument(
+        "--config", type=str, help="config file to overwrite default config")
+    parser.add_argument("--train-metadata", type=str, help="training data")
+    parser.add_argument("--dev-metadata", type=str, help="dev data")
+    parser.add_argument("--output-dir", type=str, help="output dir")
+    parser.add_argument(
+        "--device", type=str, default="gpu", help="device type to use")
+    parser.add_argument(
+        "--nprocs", type=int, default=1, help="number of processes")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose")
+
+    args = parser.parse_args()
+    if args.device == "cpu" and args.nprocs > 1:
+        raise RuntimeError("Multiprocess training on CPU is not supported.")
+    config = get_cfg_default()
+    if args.config:
+        config.merge_from_file(args.config)
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(config)
+    print(
+        f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+    )
+
+    # dispatch
+    if args.nprocs > 1:
+        dist.spawn(train_sp, (args, config), nprocs=args.nprocs)
+    else:
+        train_sp(args, config)
+
+
+if __name__ == "__main__":
+    main()
--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
@ -161,3 +161,27 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
                       mode='constant',
                       constant_values=pad_value))
    return np.array(batch, dtype=dtype), np.array(lengths, dtype=np.int64)
+
+
+def batch_sequences(sequences, axis=0, pad_value=0):
+    # import pdb; pdb.set_trace()
+    seq = sequences[0]
+    ndim = seq.ndim
+    if axis < 0:
+        axis += ndim
+    dtype = seq.dtype
+    pad_value = dtype.type(pad_value)
+    seq_lengths = [seq.shape[axis] for seq in sequences]
+    max_length = np.max(seq_lengths)
+
+    padded_sequences = []
+    for seq, length in zip(sequences, seq_lengths):
+        padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (
+            ndim - axis - 1)
+        padded_seq = np.pad(seq,
+                            padding,
+                            mode='constant',
+                            constant_values=pad_value)
+        padded_sequences.append(padded_seq)
+    batch = np.stack(padded_sequences)
+    return batch
--- a/parakeet/models/parallel_wavegan.py
+++ b/parakeet/models/parallel_wavegan.py
@ -768,3 +768,15 @@ class ResidualPWGDiscriminator(nn.Layer):
                pass

        self.apply(_remove_weight_norm)
+
+
+class PWGInference(nn.Layer):
+    def __init__(self, normalizer, pwg_generator):
+        super().__init__()
+        self.normalizer = normalizer
+        self.pwg_generator = pwg_generator
+
+    def forward(self, logmel):
+        normalized_mel = self.normalizer(logmel)
+        wav = self.pwg_generator.inference(normalized_mel)
+        return wav
--- a/parakeet/models/speedyspeech.py
+++ b/parakeet/models/speedyspeech.py
@ -0,0 +1,226 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle
+from paddle import Tensor
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+from parakeet.modules.positional_encoding import sinusoid_position_encoding
+from parakeet.modules.expansion import expand
+
+
+class ResidualBlock(nn.Layer):
+    def __init__(self, channels, kernel_size, dilation, n=2):
+        super().__init__()
+        blocks = [
+            nn.Sequential(
+                nn.Conv1D(
+                    channels,
+                    channels,
+                    kernel_size,
+                    dilation=dilation,
+                    padding="same",
+                    data_format="NLC"),
+                nn.ReLU(),
+                nn.BatchNorm1D(
+                    channels, data_format="NLC"), ) for _ in range(n)
+        ]
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return x + self.blocks(x)
+
+
+class TextEmbedding(nn.Layer):
+    def __init__(self,
+                 vocab_size: int,
+                 embedding_size: int,
+                 tone_vocab_size: int=None,
+                 tone_embedding_size: int=None,
+                 padding_idx: int=None,
+                 tone_padding_idx: int=None,
+                 concat: bool=False):
+        super().__init__()
+        self.text_embedding = nn.Embedding(vocab_size, embedding_size,
+                                           padding_idx)
+        if tone_vocab_size:
+            tone_embedding_size = tone_embedding_size or embedding_size
+            if tone_embedding_size != embedding_size and not concat:
+                raise ValueError(
+                    "embedding size != tone_embedding size, only conat is avaiable."
+                )
+            self.tone_embedding = nn.Embedding(
+                tone_vocab_size, tone_embedding_size, tone_padding_idx)
+        self.concat = concat
+
+    def forward(self, text, tone=None):
+        text_embed = self.text_embedding(text)
+        if tone is None:
+            return text_embed
+        tone_embed = self.tone_embedding(tone)
+        if self.concat:
+            embed = paddle.concat([text_embed, tone_embed], -1)
+        else:
+            embed = text_embed + tone_embed
+        return embed
+
+
+class SpeedySpeechEncoder(nn.Layer):
+    def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
+                 dilations):
+        super().__init__()
+        self.embedding = TextEmbedding(
+            vocab_size,
+            hidden_size,
+            tone_size,
+            padding_idx=0,
+            tone_padding_idx=0)
+        self.prenet = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size),
+            nn.ReLU(), )
+        res_blocks = [
+            ResidualBlock(
+                hidden_size, kernel_size, d, n=2) for d in dilations
+        ]
+        self.res_blocks = nn.Sequential(*res_blocks)
+
+        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
+        self.postnet2 = nn.Sequential(
+            nn.ReLU(),
+            nn.BatchNorm1D(
+                hidden_size, data_format="NLC"),
+            nn.Linear(hidden_size, hidden_size), )
+
+    def forward(self, text, tones):
+        embedding = self.embedding(text, tones)
+        embedding = self.prenet(embedding)
+        x = self.res_blocks(embedding)
+        x = embedding + self.postnet1(x)
+        x = self.postnet2(x)
+        return x
+
+
+class DurationPredictor(nn.Layer):
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.layers = nn.Sequential(
+            ResidualBlock(
+                hidden_size, 4, 1, n=1),
+            ResidualBlock(
+                hidden_size, 3, 1, n=1),
+            ResidualBlock(
+                hidden_size, 1, 1, n=1),
+            nn.Linear(hidden_size, 1))
+
+    def forward(self, x):
+        return paddle.squeeze(self.layers(x), -1)
+
+
+class SpeedySpeechDecoder(nn.Layer):
+    def __init__(self, hidden_size, output_size, kernel_size, dilations):
+        super().__init__()
+        res_blocks = [
+            ResidualBlock(
+                hidden_size, kernel_size, d, n=2) for d in dilations
+        ]
+        self.res_blocks = nn.Sequential(*res_blocks)
+
+        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
+        self.postnet2 = nn.Sequential(
+            ResidualBlock(
+                hidden_size, kernel_size, 1, n=2),
+            nn.Linear(hidden_size, output_size))
+
+    def forward(self, x):
+        xx = self.res_blocks(x)
+        x = x + self.postnet1(xx)
+        x = self.postnet2(x)
+        return x
+
+
+class SpeedySpeech(nn.Layer):
+    """SpeedySpeech model: encoder, duration predictor and decoder.
+
+    The duration predictor and the decoder both consume the encoder output,
+    so the three hidden sizes presumably have to agree -- TODO confirm
+    against the configuration.
+    """
+
+    def __init__(
+            self,
+            vocab_size,
+            encoder_hidden_size,
+            encoder_kernel_size,
+            encoder_dilations,
+            duration_predictor_hidden_size,
+            decoder_hidden_size,
+            decoder_output_size,
+            decoder_kernel_size,
+            decoder_dilations,
+            tone_size=None, ):
+        super().__init__()
+        encoder = SpeedySpeechEncoder(vocab_size, tone_size,
+                                      encoder_hidden_size, encoder_kernel_size,
+                                      encoder_dilations)
+        duration_predictor = DurationPredictor(duration_predictor_hidden_size)
+        decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
+                                      decoder_kernel_size, decoder_dilations)
+
+        self.encoder = encoder
+        self.duration_predictor = duration_predictor
+        self.decoder = decoder
+
+    def forward(self, text, tones, plens, durations):
+        """Training forward pass.
+
+        The encodings are expanded with the given ``durations`` rather than
+        with the predicted ones. ``plens`` is accepted but not used in this
+        method.
+
+        Returns:
+            A ``(decoded, pred_durations)`` pair.
+        """
+        encodings = self.encoder(text, tones)
+        # the predictor gets detached encodings, so no gradient flows from
+        # it back into the encoder
+        pred_durations = self.duration_predictor(encodings.detach())  # (B, T)
+
+        # expand encodings
+        durations_to_expand = durations
+        encodings = expand(encodings, durations_to_expand)
+
+        # decode
+        # add sinusoidal positional encoding to the expanded encodings
+        _, t_dec, feature_size = encodings.shape
+        encodings += sinusoid_position_encoding(t_dec, feature_size)
+        decoded = self.decoder(encodings)
+        return decoded, pred_durations
+
+    def inference(self, text, tones):
+        """Synthesize a single, unbatched example.
+
+        The encodings are expanded with the predicted durations: the
+        predictor output is exponentiated, rounded and cast to int64.
+        The result is returned without the batch axis.
+        """
+        # text: [T]
+        # tones: [T]
+        # add a batch axis of size 1
+        text = text.unsqueeze(0)
+        if tones is not None:
+            tones = tones.unsqueeze(0)
+
+        encodings = self.encoder(text, tones)
+        pred_durations = self.duration_predictor(encodings)  # (1, T)
+        durations_to_expand = paddle.round(pred_durations.exp())
+        durations_to_expand = (durations_to_expand).astype(paddle.int64)
+        encodings = expand(encodings, durations_to_expand)
+
+        # NOTE(review): paddle.shape is used here instead of `.shape`,
+        # presumably to keep the sizes dynamic -- confirm before changing
+        shape = paddle.shape(encodings)
+        t_dec, feature_size = shape[1], shape[2]
+        encodings += sinusoid_position_encoding(t_dec, feature_size)
+        decoded = self.decoder(encodings)
+        # drop the batch axis added above
+        return decoded[0]
+
+
+class SpeedySpeechInference(nn.Layer):
+    def __init__(self, normalizer, speedyspeech_model):
+        super().__init__()
+        self.normalizer = normalizer
+        self.acoustic_model = speedyspeech_model
+
+    def forward(self, phones, tones):
+        normalized_mel = self.acoustic_model.inference(phones, tones)
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
--- a/parakeet/models/transformer_tts.py
+++ b/parakeet/models/transformer_tts.py
@ -403,7 +403,7 @@ class TransformerTTS(nn.Layer):
        else:
            self.toned = False
        # position encoding matrix may be extended later
-        self.encoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_encoder)
+        self.encoder_pe = pe.sinusoid_positional_encoding(1000, d_encoder)
        self.encoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn,
@ -411,7 +411,7 @@ class TransformerTTS(nn.Layer):

        # decoder
        self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
-        self.decoder_pe = pe.sinusoid_positional_encoding(0, 1000, d_decoder)
+        self.decoder_pe = pe.sinusoid_positional_encoding(1000, d_decoder)
        self.decoder_pe_scalar = self.create_parameter(
            [1], attr=I.Constant(1.))
        self.decoder = TransformerDecoder(
@ -488,7 +488,7 @@ class TransformerTTS(nn.Layer):
        # twice its length if needed
        if x.shape[1] * self.r > self.decoder_pe.shape[0]:
            new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
-            self.decoder_pe = pe.sinusoid_positional_encoding(0, new_T,
+            self.decoder_pe = pe.sinusoid_positional_encoding(new_T,
                                                              self.d_decoder)
        pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
        x = x.scale(math.sqrt(
--- a/parakeet/modules/expansion.py
+++ b/parakeet/modules/expansion.py
@ -0,0 +1,39 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+from paddle import Tensor
+
+
+def expand(encodings: Tensor, durations: Tensor) -> Tensor:
+    """Repeat each encoder step along time according to its duration.
+
+    Parameters
+    ----------
+    encodings : Tensor [shape=(B, T, C)]
+        Encoder outputs.
+    durations : Tensor [shape=(B, T)]
+        Number of output frames for each encoder step. Expected to hold
+        non-negative integers.
+
+    Returns
+    -------
+    Tensor [shape=(B, T_dec, C)]
+        Expanded encodings, where ``T_dec`` is the largest per-sequence sum
+        of durations. Sequences with a smaller sum are zero-padded at the
+        end.
+
+    Notes
+    -----
+    ``durations`` is converted to a numpy array, so the alignment matrix
+    is built outside of paddle and enters the matmul as a constant.
+    """
+    durations = durations.numpy()
+    # step j of sequence i covers output frames [starts[i, j], ends[i, j])
+    ends = np.cumsum(durations, axis=-1)
+    starts = ends - durations
+    t_dec = int(np.max(np.sum(durations, -1)))
+
+    frames = np.arange(t_dec).reshape(1, -1, 1)  # (1, T_dec, 1)
+    # M[i, f, j] is 1 when frame f of sequence i is produced by step j
+    M = (frames >= starts[:, None, :]) & (frames < ends[:, None, :])
+    M = paddle.to_tensor(M.astype(np.float64), dtype=encodings.dtype)
+    return paddle.matmul(M, encodings)
--- a/parakeet/modules/normalizer.py
+++ b/parakeet/modules/normalizer.py
@ -0,0 +1,29 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import nn
+
+
+class ZScore(nn.Layer):
+    """Z-score normalization with stored statistics.
+
+    ``mu`` and ``sigma`` are registered as buffers. ``forward`` computes
+    ``(x - mu) / sigma`` and ``inverse`` computes ``x * sigma + mu``.
+    """
+
+    # feature last (presumably the statistics broadcast against the last
+    # axis of ``x`` -- confirm against the caller)
+    def __init__(self, mu, sigma):
+        super().__init__()
+        for name, stat in (("mu", mu), ("sigma", sigma)):
+            self.register_buffer(name, stat)
+
+    def forward(self, x):
+        centered = x - self.mu
+        return centered / self.sigma
+
+    def inverse(self, x):
+        rescaled = x * self.sigma
+        return rescaled + self.mu
--- a/parakeet/modules/positional_encoding.py
+++ b/parakeet/modules/positional_encoding.py
@ -14,47 +14,56 @@

 import math
 import numpy as np
+
 import paddle
+from paddle import Tensor
 from paddle.nn import functional as F

-__all__ = ["sinusoid_positional_encoding"]
+__all__ = ["sinusoid_position_encoding", "scaled_position_encoding"]


-def sinusoid_positional_encoding(start_index, length, size, dtype=None):
-    r"""Generate standard positional encoding matrix.
-    
-    .. math::
-    
-        pe(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{size}}}) \\
-        pe(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{size}}})
-    
-    Parameters
-    ----------
-    start_index : int
-        The start index.
-    length : int
-        The timesteps of the positional encoding to generate.
-    size : int 
-        Feature size of positional encoding.
-    
-    Returns
-    -------
-    Tensor [shape=(length, size)]
-        The positional encoding.
-        
-    Raises
-    ------
-    ValueError
-        If ``size`` is not divisible by 2.
-    """
-    if (size % 2 != 0):
+def sinusoid_position_encoding(num_positions: int,
+                               feature_size: int,
+                               omega: float=1.0,
+                               start_pos: int=0,
+                               dtype=None) -> Tensor:
+    """Generate a sinusoidal position encoding matrix.
+
+    For every position ``pos`` in ``[start_pos, start_pos + num_positions)``
+    and every even channel index ``c`` in ``[0, feature_size)``, let
+    ``p = pos * omega / 10000 ** (c / feature_size)``. Column ``c`` of the
+    result holds ``sin(p)`` and column ``c + 1`` holds ``cos(p)``.
+
+    Parameters
+    ----------
+    num_positions : int
+        Number of positions (rows) to generate.
+    feature_size : int
+        Number of channels (columns); must be even.
+    omega : float, optional
+        Multiplier applied to every position index, by default 1.0.
+    start_pos : int, optional
+        Index of the first position, by default 0.
+    dtype : optional
+        Data type of the result. When not given, the value of
+        ``paddle.get_default_dtype()`` is used.
+
+    Returns
+    -------
+    Tensor [shape=(num_positions, feature_size)]
+        The position encoding.
+
+    Raises
+    ------
+    ValueError
+        If ``feature_size`` is not divisible by 2.
+    """
+    if (feature_size % 2 != 0):
        raise ValueError("size should be divisible by 2")
    dtype = dtype or paddle.get_default_dtype()
-    channel = np.arange(0, size, 2)
-    index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
-    encodings = np.zeros([length, size])
-    encodings[:, 0::2] = np.sin(p)
-    encodings[:, 1::2] = np.cos(p)
-    encodings = paddle.to_tensor(encodings)
+
+    # even channel indices 0, 2, 4, ... and the position indices
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype)
+    # p has shape (num_positions, feature_size // 2)
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros([num_positions, feature_size], dtype=dtype)
+    # interleave: sine in the even columns, cosine in the odd columns
+    encodings[:, 0::2] = paddle.sin(p)
+    encodings[:, 1::2] = paddle.cos(p)
+    return encodings
+
+
+def scaled_position_encoding(num_positions: int,
+                             feature_size: int,
+                             omega: Tensor,
+                             start_pos: int=0,
+                             dtype=None) -> Tensor:
+    """Generate one sinusoidal position encoding matrix per batch item.
+
+    Same layout as a plain sinusoidal encoding (sine in the even columns,
+    cosine in the odd columns), except that the position index of batch
+    item ``b`` is multiplied by ``omega[b]``.
+
+    Parameters
+    ----------
+    num_positions : int
+        Number of positions to generate.
+    feature_size : int
+        Number of channels; must be even.
+    omega : Tensor [shape=(batch_size, )]
+        Per-item multiplier applied to the position indices.
+    start_pos : int, optional
+        Index of the first position, by default 0.
+    dtype : optional
+        Data type of the channel indices and of the result. When not
+        given, the value of ``paddle.get_default_dtype()`` is used.
+
+    Returns
+    -------
+    Tensor [shape=(batch_size, num_positions, feature_size)]
+        The position encodings.
+
+    Raises
+    ------
+    ValueError
+        If ``feature_size`` is not divisible by 2.
+    """
+    # consider renaming this as batched positioning encoding
+    if (feature_size % 2 != 0):
+        raise ValueError("size should be divisible by 2")
+    dtype = dtype or paddle.get_default_dtype()
+
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    # NOTE(review): the position indices use omega's dtype, not ``dtype``;
+    # confirm this is intended when the two differ.
+    index = paddle.arange(
+        start_pos, start_pos + num_positions, 1, dtype=omega.dtype)
+    batch_size = omega.shape[0]
+    # (batch_size, ) -> (batch_size, 1, 1) so it broadcasts over positions
+    omega = paddle.unsqueeze(omega, [1, 2])
+    # p has shape (batch_size, num_positions, feature_size // 2)
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros(
+        [batch_size, num_positions, feature_size], dtype=dtype)
+    # it is nice to have fancy indexing and inplace operations
+    encodings[:, :, 0::2] = paddle.sin(p)
+    encodings[:, :, 1::2] = paddle.cos(p)
    return encodings
--- a/parakeet/modules/ssim.py
+++ b/parakeet/modules/ssim.py
@ -0,0 +1,84 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from math import exp
+
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+
+def gaussian(window_size, sigma):
+    """Return a 1-D Gaussian window with ``window_size`` taps that sums to 1.
+
+    The window is centered on index ``window_size // 2`` and uses ``sigma``
+    as its standard deviation, measured in taps.
+    """
+    center = window_size // 2
+    denom = float(2 * sigma**2)
+    weights = [exp(-(pos - center)**2 / denom) for pos in range(window_size)]
+    gauss = paddle.to_tensor(weights)
+    return gauss / gauss.sum()
+
+
+def create_window(window_size, channel):
+    """Build a 2-D Gaussian kernel repeated once per channel.
+
+    The kernel is the outer product of a 1-D Gaussian window (sigma 1.5)
+    with itself, expanded to shape
+    ``[channel, 1, window_size, window_size]``.
+    """
+    column = gaussian(window_size, 1.5).unsqueeze(1)  # (window_size, 1)
+    row = paddle.transpose(column, [1, 0])  # (1, window_size)
+    kernel = paddle.matmul(column, row).unsqueeze([0, 1])
+    return paddle.expand(kernel, [channel, 1, window_size, window_size])
+
+
+def _ssim(img1, img2, window, window_size, channel, size_average=True):
+    """Compute the structural similarity of ``img1`` and ``img2``.
+
+    Local means, variances and the covariance are obtained by convolving
+    with ``window`` (padding ``window_size // 2``, ``groups=channel``).
+
+    When ``size_average`` is true the mean of the whole SSIM map is
+    returned; otherwise the map is averaged over axis 1 three times in a
+    row, which for a 4-D map leaves one value per item of axis 0.
+    """
+    pad = window_size // 2
+
+    def smooth(image):
+        # windowed local average, one convolution group per channel
+        return F.conv2d(image, window, padding=pad, groups=channel)
+
+    mu1 = smooth(img1)
+    mu2 = smooth(img2)
+    mu1_sq = mu1.pow(2)
+    mu2_sq = mu2.pow(2)
+    mu1_mu2 = mu1 * mu2
+
+    # local (co)variances: E[xy] - E[x]E[y]
+    sigma1_sq = smooth(img1 * img1) - mu1_sq
+    sigma2_sq = smooth(img2 * img2) - mu2_sq
+    sigma12 = smooth(img1 * img2) - mu1_mu2
+
+    # small constants that keep the ratio stable near zero denominators
+    C1 = 0.01**2
+    C2 = 0.03**2
+
+    numerator = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
+    denominator = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
+    ssim_map = numerator / denominator
+
+    if not size_average:
+        return ssim_map.mean(1).mean(1).mean(1)
+    return ssim_map.mean()
+
+
+class SSIM(nn.Layer):
+    """Layer form of SSIM with a window built once in the constructor.
+
+    The window is created for ``channel = 1`` and reused by every
+    ``forward`` call.
+    """
+
+    def __init__(self, window_size=11, size_average=True):
+        super().__init__()
+        self.window_size = window_size
+        self.size_average = size_average
+        self.channel = 1
+        self.window = create_window(window_size, self.channel)
+
+    def forward(self, img1, img2):
+        return _ssim(
+            img1,
+            img2,
+            window=self.window,
+            window_size=self.window_size,
+            channel=self.channel,
+            size_average=self.size_average)
+
+
+def ssim(img1, img2, window_size=11, size_average=True):
+    """Functional SSIM: build a window for ``img1`` and evaluate it.
+
+    ``img1.shape`` must have exactly four entries; the second one is used
+    as the channel count for the window and the grouped convolution.
+    """
+    _dim0, channel, _dim2, _dim3 = img1.shape
+    return _ssim(img1, img2,
+                 create_window(window_size, channel), window_size, channel,
+                 size_average)
--- a/parakeet/training/trainer.py
+++ b/parakeet/training/trainer.py
@ -123,8 +123,6 @@ class Trainer(object):
        update = self.updater.update  # training step
        stop_trigger = self.stop_trigger

-        print(self.updater.state)
-
        # display only one progress bar
        max_iteration = None
        if isinstance(stop_trigger, LimitTrigger):
--- a/parakeet/training/triggers/interval_trigger.py
+++ b/parakeet/training/triggers/interval_trigger.py
@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from copy import deepcopy
+

 class IntervalTrigger(object):
    """A Predicate to do something every N cycle."""
@ -23,9 +25,16 @@ class IntervalTrigger(object):
            raise ValueError("period should be a positive integer.")
        self.period = period
        self.unit = unit
+        self.last_index = None

     def __call__(self, trainer):
-        state = trainer.updater.state
-        index = getattr(state, self.unit)
-        fire = index % self.period == 0
+        """Return True when the counter crossed a multiple of ``period``.
+
+        The counter is the attribute named ``self.unit`` of
+        ``trainer.updater.state``. It is compared with the value seen on
+        the previous call: the trigger fires when the two values fall into
+        different ``period``-sized buckets (``value // period`` differs),
+        rather than only when the counter is an exact multiple of
+        ``period``.
+        """
+        if self.last_index is None:
+            # First call: take the current counter as the baseline, so this
+            # call compares the counter with itself and does not fire.
+            last_index = getattr(trainer.updater.state, self.unit)
+            self.last_index = last_index
+
+        last_index = self.last_index
+        index = getattr(trainer.updater.state, self.unit)
+        fire = index // self.period != last_index // self.period
+
+        # remember the counter for the comparison in the next call
+        self.last_index = index
         return fire
--- a/parakeet/training/updaters/standard_updater.py
+++ b/parakeet/training/updaters/standard_updater.py
@ -106,8 +106,8 @@ class StandardUpdater(UpdaterBase):
        self.update_core(batch)

        self.state.iteration += 1
-        if self.updaters_per_epoch is not None:
-            if self.state.iteration % self.updaters_per_epoch == 0:
+        if self.updates_per_epoch is not None:
+            if self.state.iteration % self.updates_per_epoch == 0:
                self.state.epoch += 1

    def update_core(self, batch):
@ -139,7 +139,7 @@ class StandardUpdater(UpdaterBase):
        self.optimizer.update()

    @property
-    def updaters_per_epoch(self):
+    def updates_per_epoch(self):
        """Number of updater per epoch, determined by the length of the 
        dataloader."""
        length_of_dataloader = None
--- a/tests/test_expansion.py
+++ b/tests/test_expansion.py
@ -0,0 +1,29 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from parakeet.modules import expansion
+
+
+def test_expand():
+    """Check the shape, the contents and the zero padding of ``expand``."""
+    x = paddle.randn([2, 4, 3])  # (B, T, C)
+    lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
+    y = expansion.expand(x, lengths)
+
+    # the longest sequence has 3 + 1 + 4 + 0 = 8 frames
+    assert y.shape == [2, 8, 3]
+
+    # encoder step that feeds each output frame; the first sequence only
+    # has 1 + 2 + 2 + 1 = 6 frames, so its last 2 frames are padding
+    sources = [[0, 1, 1, 2, 2, 3], [0, 0, 0, 1, 2, 2, 2, 2]]
+    # loose tolerance: the expansion is a matmul, which may not be bit-exact
+    tol = 1e-3
+    for b, src in enumerate(sources):
+        n = len(src)
+        expected = paddle.gather(x[b], paddle.to_tensor(src))
+        assert paddle.abs(y[b, :n] - expected).numpy().max() < tol
+        if n < y.shape[1]:
+            # frames past the end of the sequence must be zero
+            assert paddle.abs(y[b, n:]).numpy().max() < tol
--- a/tests/test_to_static.py
+++ b/tests/test_to_static.py
@ -0,0 +1,34 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+from paddle.jit import to_static
+from paddle.static import InputSpec
+
+
+def test_applicative_evaluation():
+    """Round-trip a function through ``to_static``, ``jit.save`` and ``jit.load``.
+
+    The converted function scales its input by ``math.sqrt(2)``, a value
+    computed in plain Python. The test saves the converted function under
+    ``./temp_test_to_static`` (relative to the working directory), loads it
+    back, calls it and prints input and output. It makes no assertions, so
+    it only fails if one of these steps raises.
+    """
+
+    def m_sqrt2(x):
+        return paddle.scale(x, math.sqrt(2))
+
+    # one input with shape [-1] (presumably a 1-D tensor of unknown length)
+    subgraph = to_static(m_sqrt2, input_spec=[InputSpec([-1])])
+    paddle.jit.save(subgraph, './temp_test_to_static')
+
+    fn = paddle.jit.load('./temp_test_to_static')
+    x = paddle.arange(10, dtype=paddle.float32)
+    y = fn(x)
+
+    print(x)
+    print(y)