Merge pull request #64 from PaddlePaddle/doc

Update docstrings
Feiyu Chan 2020-12-18 20:58:59 +08:00 committed by GitHub
commit badf72d611
43 changed files with 2606 additions and 619 deletions

20
doc/Makefile Normal file

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

35
doc/make.bat Normal file

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

64
doc/source/conf.py Normal file

@ -0,0 +1,64 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'parakeet'
copyright = '2020, parakeet-developers'
author = 'parakeet-developers'
# The full version, including alpha/beta/rc tags
release = '0.2'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
"sphinx_rtd_theme",
'sphinx.ext.mathjax',
'numpydoc',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
source_suffix = ['.rst', '.md']
# -- Extension configuration -------------------------------------------------
numpydoc_show_class_members = False

20
doc/source/index.rst Normal file

@ -0,0 +1,20 @@
.. parakeet documentation master file, created by
sphinx-quickstart on Thu Dec 17 20:01:34 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to parakeet's documentation!
====================================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

7
doc/source/modules.rst Normal file

@ -0,0 +1,7 @@
parakeet
========
.. toctree::
:maxdepth: 4
parakeet


@ -0,0 +1,29 @@
parakeet.audio package
======================
Submodules
----------
parakeet.audio.audio module
---------------------------
.. automodule:: parakeet.audio.audio
:members:
:undoc-members:
:show-inheritance:
parakeet.audio.spec\_normalizer module
--------------------------------------
.. automodule:: parakeet.audio.spec_normalizer
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.audio
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,29 @@
parakeet.data package
=====================
Submodules
----------
parakeet.data.batch module
--------------------------
.. automodule:: parakeet.data.batch
:members:
:undoc-members:
:show-inheritance:
parakeet.data.dataset module
----------------------------
.. automodule:: parakeet.data.dataset
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.data
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,29 @@
parakeet.datasets package
=========================
Submodules
----------
parakeet.datasets.common module
-------------------------------
.. automodule:: parakeet.datasets.common
:members:
:undoc-members:
:show-inheritance:
parakeet.datasets.ljspeech module
---------------------------------
.. automodule:: parakeet.datasets.ljspeech
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.datasets
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,37 @@
parakeet.frontend package
=========================
Submodules
----------
parakeet.frontend.phonectic module
----------------------------------
.. automodule:: parakeet.frontend.phonectic
:members:
:undoc-members:
:show-inheritance:
parakeet.frontend.punctuation module
------------------------------------
.. automodule:: parakeet.frontend.punctuation
:members:
:undoc-members:
:show-inheritance:
parakeet.frontend.vocab module
------------------------------
.. automodule:: parakeet.frontend.vocab
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.frontend
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,45 @@
parakeet.models package
=======================
Submodules
----------
parakeet.models.tacotron2 module
--------------------------------
.. automodule:: parakeet.models.tacotron2
:members:
:undoc-members:
:show-inheritance:
parakeet.models.transformer\_tts module
---------------------------------------
.. automodule:: parakeet.models.transformer_tts
:members:
:undoc-members:
:show-inheritance:
parakeet.models.waveflow module
-------------------------------
.. automodule:: parakeet.models.waveflow
:members:
:undoc-members:
:show-inheritance:
parakeet.models.wavenet module
------------------------------
.. automodule:: parakeet.models.wavenet
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.models
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,77 @@
parakeet.modules package
========================
Submodules
----------
parakeet.modules.attention module
---------------------------------
.. automodule:: parakeet.modules.attention
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.audio module
-----------------------------
.. automodule:: parakeet.modules.audio
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.conv module
----------------------------
.. automodule:: parakeet.modules.conv
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.geometry module
--------------------------------
.. automodule:: parakeet.modules.geometry
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.losses module
------------------------------
.. automodule:: parakeet.modules.losses
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.masking module
-------------------------------
.. automodule:: parakeet.modules.masking
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.positional\_encoding module
--------------------------------------------
.. automodule:: parakeet.modules.positional_encoding
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.transformer module
-----------------------------------
.. automodule:: parakeet.modules.transformer
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.modules
:members:
:undoc-members:
:show-inheritance:

25
doc/source/parakeet.rst Normal file

@ -0,0 +1,25 @@
parakeet package
================
Subpackages
-----------
.. toctree::
:maxdepth: 4
parakeet.audio
parakeet.data
parakeet.datasets
parakeet.frontend
parakeet.models
parakeet.modules
parakeet.training
parakeet.utils
Module contents
---------------
.. automodule:: parakeet
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,37 @@
parakeet.training package
=========================
Submodules
----------
parakeet.training.cli module
----------------------------
.. automodule:: parakeet.training.cli
:members:
:undoc-members:
:show-inheritance:
parakeet.training.default\_config module
----------------------------------------
.. automodule:: parakeet.training.default_config
:members:
:undoc-members:
:show-inheritance:
parakeet.training.experiment module
-----------------------------------
.. automodule:: parakeet.training.experiment
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.training
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,61 @@
parakeet.utils package
======================
Submodules
----------
parakeet.utils.checkpoint module
--------------------------------
.. automodule:: parakeet.utils.checkpoint
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.display module
-----------------------------
.. automodule:: parakeet.utils.display
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.internals module
-------------------------------
.. automodule:: parakeet.utils.internals
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.layer\_tools module
----------------------------------
.. automodule:: parakeet.utils.layer_tools
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.mp\_tools module
-------------------------------
.. automodule:: parakeet.utils.mp_tools
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.scheduler module
-------------------------------
.. automodule:: parakeet.utils.scheduler
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.utils
:members:
:undoc-members:
:show-inheritance:

112
docs/config_cn.md Normal file

@ -0,0 +1,112 @@
# Experiment Configuration
This section describes the recommended way to configure experiments in parakeet, and the reasons behind these choices.
## What Goes into a Configuration
Deep learning experiments usually have many configurable options. They roughly fall into several categories:
1. data sources and data processing;
2. paths for saving experiment results;
3. data preprocessing;
4. model architecture and hyperparameters;
5. training process.
These categories may overlap; for example, part of the data preprocessing configuration is related to the model configuration. The number of mel bands, say, can be viewed either as part of the model configuration or as part of the data processing configuration. By and large, though, a configuration file can be divided into these parts.
## Common Configuration File Formats
Common configuration file formats include `ini`, `yaml`, `toml` and `json`.
`ini`
Pros: simple, supports string interpolation and similar features.
Cons: only two levels of structure; values carry no type information and must be cast manually when parsed.
`yaml`
Pros: concise syntax; values are typed, so manual casts are usually unnecessary; comments are supported.
Cons: the language specification is complex.
`toml`
Similar to yaml.
`json`
Pros: simple format.
Cons: too many delimiters, poor readability, error-prone to write by hand; comments are not supported.
Considering expressiveness and readability, we choose yaml, but we keep the configuration files as simple as possible:
1. for value types, we only use strings, integers, floats and booleans;
2. for nesting, we use at most two levels of structure.
## Configuration Options and Command-Line Arguments
In deep learning experiments, some options change frequently, such as the data source, the path for saving experiment results, or the path of the checkpoint to load. These are better implemented as command-line arguments.
The remaining options, which rarely change, are better kept in a configuration file. We recommend `yaml` for this, because it allows comments and is more human-readable.
Handling every option with argparse is of course possible, but for deep learning experiments with many options it makes the code extremely verbose.
Note, however, that when a configuration file and a command-line parser are used together, the options supported by the configuration file do not show up in the usage and help messages of argparse.ArgumentParser unless special handling is added. This is mainly due to inherent design differences between configuration file parsing and argparse.
Attaching the options supported by the configuration to the ArgumentParser can work around this, but it raises the question of whose defaults take precedence: those in the default configuration or those defined in the ArgumentParser.
We therefore choose not to attach the configuration options to the ArgumentParser, and instead handle the two parts separately.
## Practice
We use yacs together with argparse as our configuration tools, adding a `--config` option to the argparse command line to pass in a configuration file. yacs has several useful properties:
1. it supports yaml-format configuration files (i.e. nested structure and typed values);
2. it supports incremental overrides of a config, including overriding the configuration file with command-line arguments;
3. it supports recursive attribute access via `.key`, which is more convenient than dictionary-style `["key"]` access.
We recommend writing the default configuration as Python code (each example under examples has a config.py that provides the default configuration, with comments). When users need to override part of the configuration, they only provide the parts they want to change rather than a complete configuration file. The reasons are:
1. providing only the options to override is also the standard way many kinds of software are configured;
2. between two runs of the same model, usually only a few options change; an incremental configuration makes the differences between the two runs much easier to see than a complete one;
3. the script can be run without the `--config` argument, falling back to the default configuration, which simplifies running scripts.
When adding a new experiment, refer to the examples under examples for how to write the default configuration.
Besides specifying an override configuration file with `--config`, we also add an `--opts` option that receives the remaining command-line arguments parsed by the ArgumentParser. These are used to further override the configuration. The usage is `--opts key1 value1 key2 value2 ...`, i.e. keys and values separated by spaces, for example `--opts training.lr 0.001 model.encoder_layers 4`. The keys are the key names in the configuration; for nested options, key names are joined with `.`. A sketch of such a default configuration and how it gets overridden follows.
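As a minimal sketch (not the exact contents of any particular example), a default config.py built on yacs might look like the following; the key names and default values here are purely illustrative:
```python
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(dict(batch_size=32, d_mels=80))               # data processing options
_C.model = CN(dict(d_encoder=512, encoder_conv_layers=3))  # model hyperparameters
_C.training = CN(dict(lr=1e-3, max_iteration=500000,
                      valid_interval=1000, save_interval=1000))

def get_cfg_defaults():
    # return a fresh copy so that callers can modify it freely
    return _C.clone()
```
With such defaults, `config.merge_from_file(path)` applies the file passed via `--config`, and `config.merge_from_list(["training.lr", "0.001"])` applies the key-value pairs passed via `--opts`.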
## The Default ArgumentParser
We provide a default ArgumentParser (see `parakeet/training/cli.py`) that implements the functionality described above. It contains only a minimal set of command-line options: `--config`, `--data`, `--output`, `--checkpoint_path`, `--device`, `--nprocs` and `--opts`.
These are options almost every deep learning experiment needs, so a new experiment can use this ArgumentParser directly and add further options only when something beyond this set is required.
1. `--config` and `--opts` support configuration file parsing, while the configuration file itself handles the options specific to each experiment;
2. `--data` and `--output` are the dataset path and the path for saving training results (including the checkpoints/ folder, text outputs and visualization outputs), respectively;
3. `--checkpoint_path` loads a checkpoint before training, for resuming training from a specific checkpoint. If `--checkpoint_path` is not given and the checkpoints/ folder under `--output` already contains training results, the latest checkpoint there is loaded by default to resume training;
4. `--device` and `--nprocs` specify how to run: `--device` selects the device type (cpu or gpu), and `--nprocs` is the number of training processes; `nprocs` > 1 means multi-process parallel training. (Note: currently only multi-GPU multi-process training is supported.)
The help message is as follows (a sketch of how a training script combines this parser with the configuration follows after it):
```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
[--nprocs NPROCS] [--opts ...]
optional arguments:
-h, --help show this help message and exit
--config FILE path of the config file to overwrite to default config
with.
--data DATA_DIR path to the datatset.
--output OUTPUT_DIR path to save checkpoint and log. If not provided, a
directory is created in runs/ to save outputs.
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load
--device {cpu,gpu} device type to use, cpu and gpu are supported.
--nprocs NPROCS number of parallel processes to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
```
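Assuming the default parser is exposed as `default_argument_parser` in `parakeet/training/cli.py` and the example provides a `get_cfg_defaults` helper in its config.py (both names are assumptions used here for illustration), a training script's entry point might combine them like this:
```python
import argparse
from yacs.config import CfgNode

from parakeet.training.cli import default_argument_parser  # assumed name
from config import get_cfg_defaults                        # the example's config.py (assumed)

def main(config: CfgNode, args: argparse.Namespace):
    ...  # build the experiment from config/args and run it

if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()

    config = get_cfg_defaults()
    if args.config:                 # --config: override the defaults with a yaml file
        config.merge_from_file(args.config)
    if args.opts:                   # --opts KEY VALUE ...: override both
        config.merge_from_list(args.opts)
    config.freeze()

    main(config, args)
```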

75
docs/experiment_cn.md Normal file

@ -0,0 +1,75 @@
# Experiment Workflow
An experiment involves many details that need attention: saving and loading models, running validation periodically, writing text logs and visual logs, saving the configuration, and extra handling for different run modes. Such code can be tedious to write, but it matters a great deal for tracking how code changes affect results and for debugging. To reduce the cost of writing it, we provide a fair amount of generic helper code, for example for saving, loading and visualization, that experiment code can use directly.
For the experiment process as a whole, we provide an ExperimentBase class, a training-process template abstracted out during model and experiment development, which can be used as the base class for concrete experiments. Compared with the Trainer in chainer or Model.fit in keras, ExperimentBase is a relatively low-level API: it is used as a base class, so the user still implements the whole training process and therefore keeps control over many things, rather than being used compositionally, where the user only supplies the model, datasets and metrics and the whole training process is completed automatically.
The former approach does not save much code; it merely organizes the code in a standardized way. The latter saves a lot of code but hides how the training process is put together. If custom behavior has to be added to the standard training process, it must be implemented through extensions/hooks that are invoked at fixed points (at the start or end of an iteration, of an epoch, or of the whole training run).
Adding custom behavior to the training process through extensions/hooks usually comes with access restrictions. An extension/hook is typically implemented as a callable, but the variables that callable can access are limited, for example model, optimizer, dataloader, iteration, epoch and metrics; accessing other intermediate variables tends to be cumbersome.
Moreover, the compositional style usually presumes some protocol for passing data between components. One common assumption is that the batch produced by the dataloader is exactly the model's input. In simple cases this is mostly fine, but it is also possible that a model needs inputs beyond the batch. Another common assumption is that the criterion can compute the loss from just the model's input and output; this can be overkill, because in some cases not all fields of the input and output are needed to compute the loss, and making the criterion's interface uniform just to satisfy the protocol means passing unnecessary arguments.
## Design of ExperimentBase
We therefore chose a low-level interface: the user still operates the training process freely, and only a coarse-grained abstraction is imposed on it. See the code of [ExperimentBase](parakeet/training/experiment.py).
When subclassing ExperimentBase to write your own experiment class, follow these conventions:
1. it has the attributes `.model`, `.optimizer`, `.train_loader`, `.valid_loader`, `.config` and `.args`;
2. the configuration contains a `.training` field with the keys `valid_interval`, `save_interval` and `max_iteration`, which are used as the conditions for triggering validation, saving checkpoints and stopping training;
3. it implements the four methods `train_batch`, `valid`, `setup_model` and `setup_dataloader`: `train_batch` processes one batch, `valid` runs one validation pass over the whole validation set, `setup_model` builds the model and the optimizer (other model-construction code can also go here), and `setup_dataloader` builds train_loader and valid_loader (a sketch of such a subclass is shown right below).
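A minimal subclass following these conventions might look like the sketch below; `MyModel`, `MyDataset`, `my_batch_fn` and `compute_loss` are placeholders, and `self.logger` is assumed to be prepared by the base class's `setup_logger`:
```python
import paddle
from paddle.io import DataLoader
from parakeet.training.experiment import ExperimentBase

class MyExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        self.model = MyModel(**config.model)           # placeholder model
        self.optimizer = paddle.optimizer.Adam(
            config.training.lr, parameters=self.model.parameters())

    def setup_dataloader(self):
        train_set = MyDataset(self.args.data, split="train")  # placeholder dataset
        valid_set = MyDataset(self.args.data, split="valid")
        self.train_loader = DataLoader(train_set,
                                       batch_size=self.config.data.batch_size,
                                       shuffle=True,
                                       collate_fn=my_batch_fn)
        self.valid_loader = DataLoader(valid_set, batch_size=1,
                                       collate_fn=my_batch_fn)

    def train_batch(self):
        # fetch one batch, cycling over the train_loader
        try:
            batch = next(self._iterator)
        except (AttributeError, StopIteration):
            self._iterator = iter(self.train_loader)
            batch = next(self._iterator)
        loss = self.model.compute_loss(*batch)          # placeholder loss
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()
        self.logger.info(f"iteration {self.iteration}: loss = {float(loss)}")

    @paddle.no_grad()
    def valid(self):
        losses = [float(self.model.compute_loss(*batch))
                  for batch in self.valid_loader]
        self.logger.info(f"validation loss: {sum(losses) / len(losses)}")
```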
The initialization of an experiment is shown below. It creates the model, the optimizer and the data iterators, prepares the output directory, the logger and the visualizer, and saves the configuration. Apart from `setup_dataloader` and `setup_model`, which you implement yourself, the other methods already have standard implementations.
```python
def __init__(self, config, args):
self.config = config
self.args = args
def setup(self):
paddle.set_device(self.args.device)
if self.parallel:
self.init_parallel()
self.setup_output_dir()
self.dump_config()
self.setup_visualizer()
self.setup_logger()
self.setup_checkpointer()
self.setup_dataloader()
self.setup_model()
self.iteration = 0
self.epoch = 0
```
When using it, the following code is enough to set up an experiment:
```python
exp = Experiment(config, args)
exp.setup()
```
The whole training process can be expressed as follows:
```python
def train(self):
self.new_epoch()
while self.iteration < self.config.training.max_iteration:
self.iteration += 1
self.train_batch()
if self.iteration % self.config.training.valid_interval == 0:
self.valid()
if self.iteration % self.config.training.save_interval == 0:
self.save()
```
To start the experiment, just run the following code:
```python
exp.run()
```


@ -37,6 +37,10 @@ Dataset --(transform)--> Dataset --+
When developing a new model, the developer needs to consider how feasible it is to split the code into modules and how general each module is, and place them in the appropriate directories.
## Configuring Experiments
We use yacs and argparse to handle configuration file parsing and command-line argument parsing, respectively. For the recommended way to configure experiments, see [Experiment Configuration](./config_cn.md).
## Training Process
Training generally means running a loop body many times. A typical loop body contains the following steps:
@ -46,34 +50,27 @@ Dataset --(transform)--> Dataset --+
3. the forward/backward computation of the neural network;
4. parameter updates;
5. evaluating the model on the validation set when certain conditions are met;
6. writing logs, visualization, and saving intermediate results;
6. writing logs, visualization, and, in some cases, saving necessary intermediate results;
7. saving the states of the model and the optimizer.
The `data processing` section covers steps 1 and 2, and the model and optimizer cover steps 3 and 4, so steps 5, 6 and 7 are the main work of the training process. To keep the training loop clean and clear, it is recommended to implement model saving and loading, model evaluation, logging and visualization as functions, even though in many cases they need to access many local variables. We are also considering using an Experiment or Trainer class to standardize how such training loops are written, so that variables needed by many functions can become class members, keeping the code concise without introducing too many global variables.
`Data processing` covers the definitions of the dataset and the batch_function, while the model and the optimizer cover the definition of the model's forward/backward computation. Once the model and the data are ready, we need to put them together to complete the experiment code.
For how the training process is assembled, see [Experiment Workflow](./experiment_cn.md).
## Experiment Template
Experiment code is generally organized as follows:
```text
├── configs/ (experiment configurations)
├── data.py (definitions of Dataset, DataLoader, etc.)
├── README.md (help information for the experiment)
├── config.py (default configuration)
├── preprocess.py (data preprocessing script)
├── data.py (definitions of Dataset, batch_function, etc.)
├── synthesis.py (code for synthesis)
├── train.py (code for training)
└── utils.py (other necessary helper functions)
```
## Configuring Experiments
Deep learning experiments usually have many configurable options. They roughly fall into several categories:
1. data sources and data processing;
2. paths for saving experiment results;
3. data preprocessing;
4. model architecture and hyperparameters;
5. training process.
These categories may overlap; for example, part of the data preprocessing configuration is related to the model configuration, such as the number of mel bands.
Some options change frequently, such as the data source, the path for saving experiment results, or the path of the checkpoint to load. These are better implemented as command-line arguments. The remaining, rarely changing options are better kept in a configuration file; we recommend `yaml` because it allows comments and is more human-readable.
This repository contains several examples, which can be found under [Parakeet/examples](../examples). They are provided as samples that users can run directly. Users are also welcome to add new models and experiments and contribute code to `Parakeet`.


@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.2.0"
__version__ = "0.2.0-beta"
from parakeet import audio, data, datasets, frontend, models, modules, training, utils


@ -1,36 +0,0 @@
import parakeet
if __name__ == '__main__':
import argparse
import os
import shutil
from pathlib import Path
package_path = Path(__file__).parent
print(package_path)
parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest="cmd")
list_exp_parser = subparser.add_parser("list-examples")
clone = subparser.add_parser("clone-example")
clone.add_argument("experiment_name", type=str, help="experiment name")
args = parser.parse_args()
if args.cmd == "list-examples":
print(os.listdir(package_path / "examples"))
exit(0)
if args.cmd == "clone-example":
source = package_path / "examples" / (args.experiment_name)
target = Path(os.getcwd()) / (args.experiment_name)
if not os.path.exists(str(source)):
raise ValueError("{} does not exist".format(str(source)))
if os.path.exists(str(target)):
raise FileExistsError("{} already exists".format(str(target)))
shutil.copytree(str(source), str(target))
print("{} copied!".format(args.experiment_name))
exit(0)


@ -19,6 +19,8 @@ from parakeet.frontend.normalizer.numbers import normalize_numbers
def normalize(sentence):
""" Normalize English text.
"""
# preprocessing
sentence = unicode(sentence)
sentence = normalize_numbers(sentence)


@ -75,6 +75,8 @@ def _expand_number(m):
def normalize_numbers(text):
""" Normalize numbers in English text.
"""
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)


@ -39,6 +39,9 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert into pronunciation id sequence.
"""
def __init__(self):
self.backend = G2p()
self.phonemes = list(self.backend.phonemes)
@ -46,6 +49,18 @@ class English(Phonetics):
self.vocab = Vocab(self.phonemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
@ -54,6 +69,18 @@ class English(Phonetics):
return phonemes
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
ids = [
self.vocab.lookup(item) for item in phonemes
if item in self.vocab.stoi
@ -61,17 +88,46 @@ class English(Phonetics):
return ids
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]
The list of pronunciation id sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
class EnglishCharacter(Phonetics):
""" Normalize the input text sequence and convert it into character id sequence.
"""
def __init__(self):
self.backend = G2p()
self.graphemes = list(self.backend.graphemes)
@ -79,10 +135,34 @@ class EnglishCharacter(Phonetics):
self.vocab = Vocab(self.graphemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
str
The text sequence after normalization.
"""
words = normalize(sentence)
return words
def numericalize(self, sentence):
""" Convert a text sequence into ids.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
List of a character id sequence.
"""
ids = [
self.vocab.lookup(item) for item in sentence
if item in self.vocab.stoi
@ -90,17 +170,46 @@ class EnglishCharacter(Phonetics):
return ids
def reverse(self, ids):
""" Convert a character id sequence into text.
Parameters
-----------
ids: List[int]
List of a character id sequence.
Returns
----------
str
The input text sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Normalize the input text sequence and convert it into character id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
List of a character id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
class Chinese(Phonetics):
"""Normalize Chinese text sequence and convert it into ids.
"""
def __init__(self):
self.opencc_backend = OpenCC('t2s.json')
self.backend = G2pM()
@ -115,6 +224,18 @@ class Chinese(Phonetics):
return list(all_syllables)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
simplified = self.opencc_backend.convert(sentence)
phonemes = self.backend(simplified)
start = self.vocab.start_symbol
@ -136,15 +257,53 @@ class Chinese(Phonetics):
return cleaned_phonemes
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]
The list of pronunciation id sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
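To make the frontend API documented above concrete, here is a small usage sketch. It assumes the optional g2p backend required by `English` is installed; the example sentence and the comments on outputs are illustrative only.
```python
from parakeet.frontend.phonectic import English

frontend = English()

phonemes = frontend.phoneticize("Hello world.")  # List[str], e.g. starts with the "<s>" symbol
ids = frontend.numericalize(phonemes)            # List[int]
symbols = frontend.reverse(ids)                  # back to a list of symbols

ids2 = frontend("Hello world.")                  # __call__ = numericalize(phoneticize(...))
print(frontend.vocab_size)                       # size of the underlying Vocab
```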


@ -1,13 +1,46 @@
from typing import Dict, Iterable, List
from ruamel import yaml
from collections import OrderedDict
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Iterable, List
from collections import OrderedDict
__all__ = ["Vocab"]
class Vocab(object):
def __init__(self, symbols: Iterable[str],
""" Vocabulary.
Parameters
-----------
symbols: Iterable[str]
Common symbols.
padding_symbol: str, optional
Symbol for pad. Defaults to "<pad>".
unk_symbol: str, optional
Symbol for unknown. Defaults to "<unk>"
start_symbol: str, optional
Symbol for start. Defaults to "<s>"
end_symbol: str, optional
Symbol for end. Defaults to "</s>"
"""
def __init__(self,
symbols: Iterable[str],
padding_symbol="<pad>",
unk_symbol="<unk>",
start_symbol="<s>",
@ -23,7 +56,6 @@ class Vocab(object):
self.start_symbol = start_symbol
self.end_symbol = end_symbol
self.stoi = OrderedDict()
self.stoi.update(self.special_symbols)
@ -37,23 +69,33 @@ class Vocab(object):
@property
def num_specials(self):
""" The number of special symbols.
"""
return len(self.special_symbols)
# special tokens
@property
def padding_index(self):
""" The index of padding symbol
"""
return self.stoi.get(self.padding_symbol, -1)
@property
def unk_index(self):
"""The index of unknow symbol.
"""
return self.stoi.get(self.unk_symbol, -1)
@property
def start_index(self):
"""The index of start symbol.
"""
return self.stoi.get(self.start_symbol, -1)
@property
def end_index(self):
""" The index of end symbol.
"""
return self.stoi.get(self.end_symbol, -1)
def __repr__(self):
@ -64,12 +106,18 @@ class Vocab(object):
return self.__repr__()
def lookup(self, symbol):
""" The index that symbol correspond.
"""
return self.stoi[symbol]
def reverse(self, index):
""" The symbol thar index cottespond.
"""
return self.itos[index]
def add_symbol(self, symbol):
""" Add a new symbol in vocab.
"""
if symbol in self.stoi:
return
N = len(self.stoi)
@ -77,6 +125,7 @@ class Vocab(object):
self.itos[N] = symbol
def add_symbols(self, symbols):
""" Add multiple symbols in vocab.
"""
for symbol in symbols:
self.add_symbol(symbol)


@ -14,8 +14,9 @@
#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *
from parakeet.models.wavenet import *
from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *


@ -27,11 +27,29 @@ __all__ = ["Tacotron2", "Tacotron2Loss"]
class DecoderPreNet(nn.Layer):
"""Decoder prenet module for Tacotron2.
Parameters
----------
d_input: int
The input feature size.
d_hidden: int
The hidden size.
d_output: int
The output feature size.
dropout_rate: float
The dropout probability.
"""
def __init__(self,
d_input: int,
d_hidden: int,
d_output: int,
dropout_rate: float=0.2):
dropout_rate: float):
super().__init__()
self.dropout_rate = dropout_rate
@ -39,23 +57,59 @@ class DecoderPreNet(nn.Layer):
self.linear2 = nn.Linear(d_hidden, d_output, bias_attr=False)
def forward(self, x):
"""Calculate forward propagation.
Parameters
----------
x: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded hidden state.
"""
x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate)
output = F.dropout(F.relu(self.linear2(x)), self.dropout_rate)
return output
class DecoderPostNet(nn.Layer):
"""Decoder postnet module for Tacotron2.
Parameters
----------
d_mels: int
The number of mel bands.
d_hidden: int
The hidden size of postnet.
kernel_size: int
The kernel size of the conv layer in postnet.
num_layers: int
The number of conv layers in postnet.
dropout: float
The dropout probability.
"""
def __init__(self,
d_mels: int=80,
d_hidden: int=512,
kernel_size: int=5,
padding: int=0,
num_layers: int=5,
dropout: float=0.1):
d_mels: int,
d_hidden: int,
kernel_size: int,
num_layers: int,
dropout: float):
super().__init__()
self.dropout = dropout
self.num_layers = num_layers
padding = int((kernel_size - 1) / 2),
self.conv_batchnorms = nn.LayerList()
k = math.sqrt(1.0 / (d_mels * kernel_size))
self.conv_batchnorms.append(
@ -91,15 +145,46 @@ class DecoderPostNet(nn.Layer):
data_format='NLC'))
def forward(self, input):
"""Calculate forward propagation.
Parameters
----------
input: Tensor [shape=(B, T_mel, C)]
Output sequence of features from decoder.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Output sequence of features after postnet.
"""
for i in range(len(self.conv_batchnorms) - 1):
input = F.dropout(
F.tanh(self.conv_batchnorms[i](input), self.dropout))
input = F.dropout(self.conv_batchnorms[self.num_layers - 1](input),
output = F.dropout(self.conv_batchnorms[self.num_layers - 1](input),
self.dropout)
return input
return output
class Tacotron2Encoder(nn.Layer):
"""Tacotron2 encoder module for Tacotron2.
Parameters
----------
d_hidden: int
The hidden size in encoder module.
conv_layers: int
The number of conv layers.
kernel_size: int
The kernel size of conv layers.
p_dropout: float
The dropout probability.
"""
def __init__(self,
d_hidden: int,
conv_layers: int,
@ -126,6 +211,22 @@ class Tacotron2Encoder(nn.Layer):
d_hidden, self.hidden_size, direction="bidirectional")
def forward(self, x, input_lens=None):
"""Calculate forward propagation of tacotron2 encoder.
Parameters
----------
x: Tensor [shape=(B, T)]
Batch of the sequences of padded character ids.
text_lens: Tensor [shape=(B,)], optional
Batch of lengths of each text input batch. Defaults to None.
Returns
-------
output : Tensor [shape=(B, T, C)]
Batch of the sequences of padded hidden states.
"""
for conv_batchnorm in self.conv_batchnorms:
x = F.dropout(F.relu(conv_batchnorm(x)),
self.p_dropout) #(B, T, C)
@ -135,6 +236,47 @@ class Tacotron2Encoder(nn.Layer):
class Tacotron2Decoder(nn.Layer):
"""Tacotron2 decoder module for Tacotron2.
Parameters
----------
d_mels: int
The number of mel bands.
reduction_factor: int
The reduction factor of tacotron.
d_encoder: int
The hidden size of encoder.
d_prenet: int
The hidden size in decoder prenet.
d_attention_rnn: int
The attention rnn layer hidden size.
d_decoder_rnn: int
The decoder rnn layer hidden size.
d_attention: int
The hidden size of the linear layer in location sensitive attention.
attention_filters: int
The filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
The kernel size of the conv layer in location sensitive attention.
p_prenet_dropout: float
The dropout probability in decoder prenet.
p_attention_dropout: float
The dropout probability in location sensitive attention.
p_decoder_dropout: float
The dropout probability in decoder.
"""
def __init__(self,
d_mels: int,
reduction_factor: int,
@ -175,6 +317,8 @@ class Tacotron2Decoder(nn.Layer):
self.stop_layer = nn.Linear(d_decoder_rnn + d_encoder, 1)
def _initialize_decoder_states(self, key):
"""init states be used in decoder
"""
batch_size = key.shape[0]
MAX_TIME = key.shape[1]
@ -199,6 +343,8 @@ class Tacotron2Decoder(nn.Layer):
self.processed_key = self.attention_layer.key_layer(key) #[B, T, C]
def _decode(self, query):
"""decode one time step
"""
cell_input = paddle.concat([query, self.attention_context], axis=-1)
# The first lstm layer
@ -232,6 +378,30 @@ class Tacotron2Decoder(nn.Layer):
return decoder_output, stop_logit, self.attention_weights
def forward(self, keys, querys, mask):
"""Calculate forward propagation of tacotron2 decoder.
Parameters
----------
keys: Tensor[shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
querys: Tensor[shape(B, T_query, C)]
Batch of the sequences of padded mel spectrogram.
mask: Tensor
Mask generated with text length. Shape should be (B, T_key, T_query) or broadcastable shape.
Returns
-------
mel_output: Tensor [shape=(B, T_query, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_query)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_query, T_key)]
Attention weights.
"""
querys = paddle.reshape(
querys,
[querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
@ -263,6 +433,31 @@ class Tacotron2Decoder(nn.Layer):
return mel_outputs, stop_logits, alignments
def infer(self, key, stop_threshold=0.5, max_decoder_steps=1000):
"""Calculate forward propagation of tacotron2 decoder.
Parameters
----------
keys: Tensor [shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
mel_output: Tensor [shape=(B, T_mel, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_mel, T_key)]
Attention weights.
"""
query = paddle.zeros(
shape=[key.shape[0], self.d_mels * self.reduction_factor],
dtype=key.dtype) #[B, C]
@ -295,17 +490,76 @@ class Tacotron2Decoder(nn.Layer):
class Tacotron2(nn.Layer):
"""
Tacotron2 module for end-to-end text-to-speech (E2E-TTS).
"""Tacotron2 model for end-to-end text-to-speech (E2E-TTS).
This is a module of Spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis
by Conditioning WaveNet on Mel Spectrogram Predictions`_,
This is a model of Spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions
<https://arxiv.org/abs/1712.05884>`_,
which converts the sequence of characters
into the sequence of mel spectrogram.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
Parameters
----------
frontend : parakeet.frontend.Phonetics
Frontend used to preprocess text.
d_mels: int
Number of mel bands.
d_encoder: int
Hidden size in encoder module.
encoder_conv_layers: int
Number of conv layers in encoder.
encoder_kernel_size: int
Kernel size of conv layers in encoder.
d_prenet: int
Hidden size in decoder prenet.
d_attention_rnn: int
Attention rnn layer hidden size in decoder.
d_decoder_rnn: int
Decoder rnn layer hidden size in decoder.
attention_filters: int
Filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
Kernel size of the conv layer in location sensitive attention.
d_attention: int
Hidden size of the linear layer in location sensitive attention.
d_postnet: int
Hidden size of postnet.
postnet_kernel_size: int
Kernel size of the conv layer in postnet.
postnet_conv_layers: int
Number of conv layers in postnet.
reduction_factor: int
Reduction factor of tacotron2.
p_encoder_dropout: float
Dropout probability in encoder.
p_prenet_dropout: float
Dropout probability in decoder prenet.
p_attention_dropout: float
Dropout probability in location sensitive attention.
p_decoder_dropout: float
Dropout probability in decoder.
p_postnet_dropout: float
Dropout probability in postnet.
"""
def __init__(self,
@ -350,11 +604,38 @@ class Tacotron2(nn.Layer):
d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
padding=int((postnet_kernel_size - 1) / 2),
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
def forward(self, text_inputs, mels, text_lens, output_lens=None):
"""Calculate forward propagation of tacotron2.
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
mels: Tensor [shape(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
text_lens: Tensor [shape=(B,)]
Batch of lengths of each text input batch.
output_lens: Tensor [shape=(B,)], optional
Batch of lengths of each mels batch. Defaults to None.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of features (B, T_mel, C);
mel_outputs_postnet: output sequence of features after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs, text_lens)
@ -386,6 +667,31 @@ class Tacotron2(nn.Layer):
@paddle.no_grad()
def infer(self, text_inputs, stop_threshold=0.5, max_decoder_steps=1000):
"""Generate the mel sepctrogram of features given the sequences of character ids.
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of spectrogram (B, T_mel, C);
mel_outputs_postnet: output sequence of spectrogram after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs)
mel_outputs, stop_logits, alignments = self.decoder.infer(
@ -407,7 +713,27 @@ class Tacotron2(nn.Layer):
@paddle.no_grad()
def predict(self, text, stop_threshold=0.5, max_decoder_steps=1000):
# TODO(lifuchen): implement predict function to product mel from texts
"""Generate the mel sepctrogram of features given the sequenc of characters.
Parameters
----------
text: str
Sequence of characters.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_outputs_postnet: output sequence of spectrogram after postnet (T_mel, C);
alignments: attention weights (T_mel, T_text).
"""
ids = np.asarray(self.frontend(text))
ids = paddle.unsqueeze(paddle.to_tensor(ids, dtype='int64'), [0])
outputs = self.infer(ids, stop_threshold, max_decoder_steps)
@ -416,6 +742,24 @@ class Tacotron2(nn.Layer):
@classmethod
def from_pretrained(cls, frontend, config, checkpoint_path):
"""Build a tacotron2 model from a pretrained model.
Parameters
----------
frontend: parakeet.frontend.Phonetics
Frontend used to preprocess text.
config: yacs.config.CfgNode
Model configs.
checkpoint_path: Path or str
The path of pretrained model checkpoint, without extension name.
Returns
-------
Tacotron2
The model built from the pretrained result.
"""
model = cls(frontend,
d_mels=config.data.d_mels,
d_encoder=config.model.d_encoder,
@ -442,11 +786,45 @@ class Tacotron2(nn.Layer):
class Tacotron2Loss(nn.Layer):
""" Tacotron2 Loss module
"""
def __init__(self):
super().__init__()
def forward(self, mel_outputs, mel_outputs_postnet, stop_logits,
mel_targets, stop_tokens):
"""Calculate tacotron2 loss.
Parameters
----------
mel_outputs: Tensor [shape=(B, T_mel, C)]
Output mel spectrogram sequence.
mel_outputs_postnet: Tensor [shape(B, T_mel, C)]
Output mel spectrogram sequence after postnet.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits before sigmoid.
mel_targets: Tensor [shape=(B, T_mel, C)]
Target mel spectrogram sequence.
stop_tokens: Tensor [shape=(B,)]
Target stop token.
Returns
-------
losses : Dict[str, Tensor]
loss: the sum of the other three losses;
mel_loss: MSE loss computed from mel_targets and mel_outputs;
post_mel_loss: MSE loss computed from mel_targets and mel_outputs_postnet;
stop_loss: stop loss computed from stop_logits and stop tokens.
"""
mel_loss = paddle.nn.MSELoss()(mel_outputs, mel_targets)
post_mel_loss = paddle.nn.MSELoss()(mel_outputs_postnet, mel_targets)
stop_loss = paddle.nn.BCEWithLogitsLoss()(stop_logits, stop_tokens)
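Putting the docstrings above together, a hedged end-to-end sketch of loading a pretrained Tacotron2 and synthesizing a mel spectrogram might look as follows; the config helper, the checkpoint path and the example sentence are hypothetical, not part of this commit.
```python
from parakeet.frontend.phonectic import English
from parakeet.models.tacotron2 import Tacotron2
from config import get_cfg_defaults   # the example's config.py (assumed helper)

config = get_cfg_defaults()
frontend = English()

# checkpoint path is hypothetical and given without extension, per from_pretrained's docstring
model = Tacotron2.from_pretrained(
    frontend, config, "runs/tacotron2/checkpoints/step-100000")
model.eval()

outputs = model.predict("Parakeet is a text-to-speech toolkit.")
mel = outputs["mel_outputs_postnet"]   # (T_mel, C) spectrogram for a vocoder
attn = outputs["alignments"]           # (T_mel, T_text) attention weights
```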


@ -33,8 +33,7 @@ __all__ = ["TransformerTTS", "TransformerTTSLoss"]
# Transformer TTS's own implementation of transformer
class MultiheadAttention(nn.Layer):
"""
Multihead scaled dot product attention with drop head. See
"""Multihead scaled dot product attention with drop head. See
[Scheduled DropHead: A Regularization Method for Transformer Models](https://arxiv.org/abs/2004.13342)
for details.


@ -30,15 +30,26 @@ from parakeet.utils import checkpoint, layer_tools
def crop(x, audio_start, audio_length):
"""Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are sliced to 0.5 seconds randomly while conditions are not, upsampled conditions should also be sliced to extaclt match the time steps of the audio slice.
"""Crop the upsampled condition to match audio_length.
Args:
x (Tensor): shape(B, C, T), dtype float32, the upsample condition.
audio_start (Tensor): shape(B, ), dtype: int64, the index the starting point.
audio_length (int): the length of the audio (number of samples it contaions).
The upsampled condition has the same time steps as the whole audio does.
But since audios are sliced to 0.5 seconds randomly while conditions are
not, upsampled conditions should also be sliced to exactly match the time
steps of the audio slice.
Returns:
Tensor: shape(B, C, audio_length), cropped condition.
Parameters
----------
x : Tensor [shape=(B, C, T)]
The upsampled condition.
audio_start : Tensor [shape=(B,), dtype:int]
The index of the starting point of the audio clips.
audio_length : int
The length of the audio clip (number of samples it contains).
Returns
-------
Tensor [shape=(B, C, audio_length)]
Cropped condition.
"""
# crop audio
slices = [] # for each example
@ -54,15 +65,37 @@ def crop(x, audio_start, audio_length):
class UpsampleNet(nn.LayerList):
def __init__(self, upscale_factors=[16, 16]):
"""UpsamplingNet.
It consists of several layers of Conv2DTranspose. Each Conv2DTranspose layer upsamples the time dimension by its `stride` times. And each Conv2DTranspose's filter_size at frequency dimension is 3.
"""A network used to upsample mel spectrogram to match the time steps of
audio.
Args:
upscale_factors (list[int], optional): time upsampling factors for each Conv2DTranspose Layer. The `UpsampleNet` contains len(upscale_factor) Conv2DTranspose Layers. Each upscale_factor is used as the `stride` for the corresponding Conv2DTranspose. Defaults to [16, 16].
Note:
np.prod(upscale_factors) should equals the `hop_length` of the stft transformation used to extract spectrogram features from audios. For example, 16 * 16 = 256, then the spectram extracted using a stft transformation whose `hop_length` is 256. See `librosa.stft` for more details.
It consists of several layers of Conv2DTranspose. Each Conv2DTranspose
layer upsamples the time dimension by its `stride` times.
Also, each Conv2DTranspose's filter_size at frequency dimension is 3.
Parameters
----------
upscale_factors : List[int], optional
Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], so the default
overall upsampling factor is 256.
Notes
------
``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
transformation used to extract spectrogram features from audio.
For example, ``16 * 16 = 256``, so a spectrogram extracted with an stft
transformation whose ``hop_length`` equals 256 is suitable.
See Also
---------
``librosa.core.stft``
"""
def __init__(self, upscale_factors=[16, 16]):
super(UpsampleNet, self).__init__()
self.upscale_factors = list(upscale_factors)
self.upscale_factor = 1
@ -78,13 +111,20 @@ class UpsampleNet(nn.LayerList):
padding=(1, factor // 2))))
def forward(self, x):
"""Compute the upsampled condition.
r"""Compute the upsampled condition.
Args:
x (Tensor): shape(B, F, T), dtype float32, the condition (mel spectrogram here.) (F means the frequency bands). In the internal Conv2DTransposes, the frequency dimension is treated as `height` dimension instead of `in_channels`.
Parameters
-----------
x : Tensor [shape=(B, F, T)]
The condition (mel spectrogram here). ``F`` means the frequency
bands, which is the feature size of the input.
In the internal Conv2DTransposes, the frequency dimension
is treated as ``height`` dimension instead of ``in_channels``.
Returns:
Tensor: shape(B, F, T * upscale_factor), dtype float32, the upsampled condition.
Tensor [shape=(B, F, T \* upscale_factor)]
The upsampled condition.
"""
x = paddle.unsqueeze(x, 1)
for sublayer in self:
@ -94,19 +134,36 @@ class UpsampleNet(nn.LayerList):
class ResidualBlock(nn.Layer):
"""A Residual block used in wavenet. Conv1D-gated-tanh Block.
It consists of a Conv1DCell and a Conv1D (kernel_size=1) to integrate
information of the condition.
Notes
--------
It does not have parametric residual or skip connection.
Parameters
-----------
residual_channels : int
The feature size of the input. It is also the feature size of the
residual output and skip output.
condition_dim : int
The feature size of the condition.
filter_size : int
Kernel size of the internal convolution cells.
dilation :int
Dilation of the internal convolution cells.
"""
def __init__(self,
residual_channels: int,
condition_dim: int,
filter_size: Union[int, Sequence[int]],
dilation: int):
"""A Residual block in wavenet. It does not have parametric residual or skip connection. It consists of a Conv1DCell and an Conv1D(filter_size = 1) to integrate the condition.
Args:
residual_channels (int): the channels of the input, residual and skip.
condition_dim (int): the channels of the condition.
filter_size (int): filter size of the internal convolution cell.
dilation (int): dilation of the internal convolution cell.
"""
super(ResidualBlock, self).__init__()
dilated_channels = 2 * residual_channels
# following clarinet's implementation, we do not have parametric residual
@ -133,16 +190,28 @@ class ResidualBlock(nn.Layer):
self.condition_dim = condition_dim
def forward(self, x, condition=None):
"""Conv1D gated-tanh Block.
"""Forward pass of the ResidualBlock.
Args:
x (Tensor): shape(B, C_res, T), the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) dtype float32.
condition (Tensor, optional): shape(B, C_cond, T), the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels). Defaults to None.
Parameters
-----------
x : Tensor [shape=(B, C, T)]
The input tensor.
Returns:
(residual, skip_connection)
residual (Tensor): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock.
skip_connection (Tensor): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks.
condition : Tensor, optional [shape(B, C_cond, T)]
The condition.
It has been upsampled in time steps, so it has the same time steps
as the input does. (C_cond stands for the condition's channels.)
Defaults to None.
Returns
-----------
residual : Tensor [shape=(B, C, T)]
The residual, which is used as the input to the next ResidualBlock.
skip_connection : Tensor [shape=(B, C, T)]
The skip connection. This output is accumulated with that of
other ResidualBlocks.
"""
h = x
@ -163,22 +232,38 @@ class ResidualBlock(nn.Layer):
return residual, skip_connection
def start_sequence(self):
"""Prepare the ResidualBlock to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
"""Prepare the ResidualBlock to generate a new sequence.
Warnings
---------
This method should be called before calling ``add_input`` multiple times.
"""
self.conv.start_sequence()
self.condition_proj.start_sequence()
def add_input(self, x, condition=None):
"""Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.
"""Take a step input and return a step output.
Args:
x (Tensor): shape(B, C_res), input for a step, dtype float32.
condition (Tensor, optional): shape(B, C_cond). condition for a step, dtype float32. Defaults to None.
This method works similarly to ``forward`` but in a
``step-in-step-out`` fashion.
Returns:
(residual, skip_connection)
residual (Tensor): shape(B, C_res), the residual for a step, which is used as the input to the next layer of ResidualBlock.
skip_connection (Tensor): shape(B, C_res), the skip connection for a step. This output is accumulated with that of other ResidualBlocks.
Parameters
----------
x : Tensor [shape=(B, C)]
Input for a step.
condition : Tensor, optional [shape=(B, C_cond)]
Condition for a step. Defaults to None.
Returns
----------
residual : Tensor [shape=(B, C)]
The residual for a step, which is used as the input to the next
layer of ResidualBlock.
skip_connection : Tensor [shape=(B, C)]
The skip connection for a step. This output is accumulated with
that of other ResidualBlocks.
"""
h = x
@ -511,6 +596,54 @@ class WaveNet(nn.Layer):
class ConditionalWaveNet(nn.Layer):
r"""Conditional Wavenet. An implementation of
`WaveNet: A Generative Model for Raw Audio <http://arxiv.org/abs/1609.03499>`_.
It contains an UpsampleNet as the encoder and a WaveNet as the decoder.
It is an autoregressive model that generate raw audio.
Parameters
----------
upsample_factors : List[int]
The upsampling factors of the UpsampleNet.
n_stack : int
Number of convolution stacks in the WaveNet.
n_loop : int
Number of convolution layers in a convolution stack.
Convolution layers in a stack have exponentially growing dilations,
from 1 to :math:`k^{n_{loop} - 1}`, where k is the kernel size.
residual_channels : int
Feature size of each ResidualBlock.
output_dim : int
Feature size of the output. See ``loss_type`` for details.
n_mels : int
The number of bands of mel spectrogram.
filter_size : int, optional
Convolution kernel size of each ResidualBlock, by default 2.
loss_type : str, optional ["mog" or "softmax"]
The output type and loss type of the model, by default "mog".
If "softmax", the model input should be quantized audio and the model
outputs a discrete distribution.
If "mog", the model input is audio in floating point format, and the
model outputs parameters for a mixture of gaussian distributions.
Namely, the weight, mean and logscale of each gaussian distribution.
Thus, the ``output_size`` should be a multiple of 3.
log_scale_min : float, optional
Minimum value of the log probability density, by default -9.0.
This is only used for computing loss when ``loss_type`` is "mog", If the
"""
def __init__(self,
upsample_factors: List[int],
n_stack: int,
@ -521,8 +654,6 @@ class ConditionalWaveNet(nn.Layer):
filter_size: int=2,
loss_type: str="mog",
log_scale_min: float=-9.0):
"""Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model.
"""
super(ConditionalWaveNet, self).__init__()
self.encoder = UpsampleNet(upsample_factors)
self.decoder = WaveNet(n_stack=n_stack,
@ -537,13 +668,23 @@ class ConditionalWaveNet(nn.Layer):
def forward(self, audio, mel, audio_start):
"""Compute the output distribution given the mel spectrogram and the input(for teacher force training).
Args:
audio (Tensor): shape(B, T_audio), dtype float32, ground truth waveform, used for teacher force training.
mel (Tensor): shape(B, F, T_mel), dtype float32, mel spectrogram. Note that it is the spectrogram for the whole utterance.
audio_start (Tensor): shape(B, ), dtype: int, audio slices' start positions for each utterance.
Parameters
-----------
audio : Tensor [shape=(B, T_audio)]
Ground truth waveform, used for teacher forced training.
Returns:
Tensor: shape(B, T_audio - 1, C_putput), parameters for the output distribution.(C_output is the `output_dim` of the decoder.)
mel : Tensor [shape(B, F, T_mel)]
Mel spectrogram. Note that it is the spectrogram for the whole
utterance.
audio_start : Tensor [shape=(B,), dtype: int]
Audio slices' start positions for each utterance.
Returns
----------
Tensor [shape=(B, T_audio - 1, C_output)]
Parameters for the output distribution, where ``C_output`` is the
``output_dim`` of the decoder.
"""
audio_length = audio.shape[1] # audio clip's length
condition = self.encoder(mel)
@ -557,14 +698,21 @@ class ConditionalWaveNet(nn.Layer):
return y
def loss(self, y, t):
"""compute loss with respect to the output distribution and the targer audio.
"""Compute loss with respect to the output distribution and the target
audio.
Args:
y (Tensor): shape(B, T - 1, C_output), dtype float32, parameters of the output distribution.
t (Tensor): shape(B, T), dtype float32, target waveform.
Parameters
-----------
y : Tensor [shape=(B, T - 1, C_output)]
Parameters of the output distribution.
Returns:
Tensor: shape(1, ), dtype float32, the loss.
t : Tensor [shape=(B, T)]
Target waveform.
Returns
--------
Tensor [shape=(1,)]
The loss.
"""
t = t[:, 1:]
loss = self.decoder.loss(y, t)
@ -573,24 +721,35 @@ class ConditionalWaveNet(nn.Layer):
def sample(self, y):
"""Sample from the output distribution.
Args:
y (Tensor): shape(B, T, C_output), dtype float32, parameters of the output distribution.
Parameters
-----------
y : Tensor [shape=(B, T, C_output)]
Parameters of the output distribution.
Returns:
Tensor: shape(B, T), dtype float32, sampled waveform from the output distribution.
Returns
--------
Tensor [shape=(B, T)]
Sampled waveform from the output distribution.
"""
samples = self.decoder.sample(y)
return samples
@paddle.no_grad()
def infer(self, mel):
"""Synthesize waveform from mel spectrogram.
r"""Synthesize waveform from mel spectrogram.
Args:
mel (Tensor): shape(B, F, T), condition(mel spectrogram here).
Parameters
-----------
mel : Tensor [shape=(B, F, T)]
The condition (mel spectrogram here).
Returns:
Tensor: shape(B, T * upsacle_factor), synthesized waveform.(`upscale_factor` is the `upscale_factor` of the encoder `UpsampleNet`)
Returns
-----------
Tensor [shape=(B, T \* upscale_factor)]
Synthesized waveform.
``upscale_factor`` is the ``upscale_factor`` of the encoder
``UpsampleNet``.
"""
condition = self.encoder(mel)
batch_size, _, time_steps = condition.shape
@ -610,6 +769,20 @@ class ConditionalWaveNet(nn.Layer):
@paddle.no_grad()
def predict(self, mel):
r"""Synthesize audio from mel spectrogram.
The output and input are numpy arrays without batch.
Parameters
----------
mel : np.ndarray [shape=(C, T)]
Mel spectrogram of an utterance.
Returns
-------
Tensor : np.ndarray [shape=(C, T \* upsample_factor)]
The synthesized waveform of an utterance.
"""
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
audio = self.infer(mel)
@ -618,6 +791,21 @@ class ConditionalWaveNet(nn.Layer):
@classmethod
def from_pretrained(cls, config, checkpoint_path):
"""Build a ConditionalWaveNet model from a pretrained model.
Parameters
----------
config : yacs.config.CfgNode
Model configs.
checkpoint_path : Path or str
The path of the pretrained model checkpoint, without the extension name.
Returns
-------
ConditionalWaveNet
The model built from pretrained result.
"""
model = cls(
upsample_factors=config.model.upsample_factors,
n_stack=config.model.n_stack,
@ -631,5 +819,3 @@ class ConditionalWaveNet(nn.Layer):
layer_tools.summary(model)
checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
return model
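As a rough usage sketch (not part of the diff; ``config``, the checkpoint path and the mel file are assumptions prepared elsewhere), the pretrained vocoder can be used like this:

import numpy as np
# load the pretrained model and switch to evaluation mode
model = ConditionalWaveNet.from_pretrained(config, "checkpoints/step-1000000")
model.eval()
mel = np.load("mel.npy")      # hypothetical mel spectrogram, shape (C, T)
audio = model.predict(mel)    # synthesized waveform as a numpy array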

View File

@ -12,3 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.modules.attention import *
from parakeet.modules.audio import *
from parakeet.modules.conv import *
from parakeet.modules.geometry import *
from parakeet.modules.losses import *
from parakeet.modules.masking import *
from parakeet.modules.positional_encoding import *
from parakeet.modules.transformer import *

View File

@ -25,22 +25,34 @@ def scaled_dot_product_attention(q,
mask=None,
dropout=0.0,
training=True):
"""
scaled dot product attention with mask. Assume q, k, v all have the same
leader dimensions(denoted as * in descriptions below). Dropout is applied to
attention weights before weighted sum of values.
r"""Scaled dot product attention with masking.
Args:
q (Tensor): shape(*, T_q, d), the query tensor.
k (Tensor): shape(*, T_k, d), the key tensor.
v (Tensor): shape(*, T_k, d_v), the value tensor.
mask (Tensor, optional): shape(*, T_q, T_k) or broadcastable shape, the
mask tensor, 0 correspond to padding. Defaults to None.
Assume that q, k, v all have the same leading dimensions (denoted as * in
descriptions below). Dropout is applied to attention weights before
weighted sum of values.
Returns:
(out, attn_weights)
out (Tensor): shape(*, T_q, d_v), the context vector.
attn_weights (Tensor): shape(*, T_q, T_k), the attention weights.
Parameters
-----------
q : Tensor [shape=(\*, T_q, d)]
the query tensor.
k : Tensor [shape=(\*, T_k, d)]
the key tensor.
v : Tensor [shape=(\*, T_k, d_v)]
the value tensor.
mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
the mask tensor, zeros correspond to paddings. Defaults to None.
Returns
----------
out : Tensor [shape=(\*, T_q, d_v)]
the context vector.
attn_weights : Tensor [shape=(\*, T_q, T_k)]
the attention weights.
"""
d = q.shape[-1] # we only support imperative execution
qk = paddle.matmul(q, k, transpose_y=True)
@ -55,17 +67,25 @@ def scaled_dot_product_attention(q,
return out, attn_weights
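For reference, the unmasked core of this computation can be sketched standalone as follows (a simplified illustration, not taken from the diff):

import math
import paddle
import paddle.nn.functional as F

q = paddle.randn([2, 5, 8])    # (batch, T_q, d)
k = paddle.randn([2, 6, 8])    # (batch, T_k, d)
v = paddle.randn([2, 6, 8])    # (batch, T_k, d_v)
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(q.shape[-1])
weights = F.softmax(scores)            # (batch, T_q, T_k)
out = paddle.matmul(weights, v)        # (batch, T_q, d_v)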
def drop_head(x, drop_n_heads, training=True):
"""Drop n context vectors from multiple ones.
Parameters
----------
x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
The input, multiple context vectors.
drop_n_heads : int [0 <= drop_n_heads <= num_heads]
Number of vectors to drop.
training : bool
A flag indicating whether it is in training. If ``False``, no dropout is
applied.
Returns
-------
Tensor
The output.
"""
if not training or (drop_n_heads == 0):
return x
@ -101,21 +121,31 @@ def _concat_heads(x):
# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
"""Monohead Attention module.
Parameters
----------
model_dim : int
Feature size of the query.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
"""
def __init__(self,
model_dim: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
v_dim = v_dim or model_dim
@ -128,20 +158,29 @@ class MonoheadAttention(nn.Layer):
self.dropout = dropout
def forward(self, q, k, v, mask):
"""
Compute context vector and attention weights.
"""Compute context vector and attention weights.
Args:
q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or
broadcastable shape, dtype: float32 or float64, the mask.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
Returns:
(out, attention_weights)
out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
attention_weights (Tensor): shape(batch_size, times_steps_q, time_steps_k), the attention weights.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = self.affine_q(q) # (B, T, C)
k = self.affine_k(k)
@ -155,34 +194,40 @@ class MonoheadAttention(nn.Layer):
class MultiheadAttention(nn.Layer):
"""
Multihead scaled dot product attention.
"""Multihead Attention module.
Parameters
-----------
model_dim: int
The feature size of query.
num_heads : int
The number of attention heads.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
Raises
---------
ValueError
If ``model_dim`` is not divisible by ``num_heads``.
"""
def __init__(self,
model_dim: int,
num_heads: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MultiheadAttention, self).__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
@ -199,20 +244,29 @@ class MultiheadAttention(nn.Layer):
self.dropout = dropout
def forward(self, q, k, v, mask):
"""
Compute context vector and attention weights.
"""Compute context vector and attention weights.
Args:
q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or
broadcastable shape, dtype: float32 or float64, the mask.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
Returns:
(out, attention_weights)
out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
attention_weights (Tensor): shape(batch_size, times_steps_q, time_steps_k), the attention weights.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
k = _split_heads(self.affine_k(k), self.num_heads)
@ -228,6 +282,28 @@ class MultiheadAttention(nn.Layer):
class LocationSensitiveAttention(nn.Layer):
"""Location Sensitive Attention module.
Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_
Parameters
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of the attention representation.
location_filters : int
Number of filters of the attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
def __init__(self,
d_query: int,
d_key: int,
@ -259,6 +335,34 @@ class LocationSensitiveAttention(nn.Layer):
value,
attention_weights_cat,
mask=None):
"""Compute context vector and attention weights.
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
Concatenated attention weights.
mask : Tensor, optional
The mask. Shape should be (batch_size, time_steps_q, time_steps_k) or
broadcastable shape. Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, time_steps_q, d_attention)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
The attention weights.
"""
processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
processed_attention_weights = self.location_layer(

View File

@ -8,28 +8,48 @@ __all__ = ["quantize", "dequantize", "STFT"]
def quantize(values, n_bands):
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in [0, n_bands).
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
[0, n_bands).
Args:
values (Tensor): dtype: flaot32 or float64. the floating point value.
n_bands (int): the number of bands. The output integer Tensor's value is in the range [0, n_bans).
Parameters
-----------
values : Tensor [dtype: flaot32 or float64]
The floating point value.
Returns:
Tensor: the quantized tensor, dtype: int64.
n_bands : int
The number of bands. The output integer Tensor's value is in the range
[0, n_bans).
Returns
----------
Tensor [dtype: int 64]
The quantized tensor.
"""
quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64")
return quantized
def dequantize(quantized, n_bands, dtype=None):
"""Linearlly dequantize an integer Tensor into a float Tensor in the range [-1, 1).
"""Linearlly dequantize an integer Tensor into a float Tensor in the range
[-1, 1).
Args:
quantized (Tensor): dtype: int64. The quantized value in the range [0, n_bands).
n_bands (int): number of bands. The input integer Tensor's value is in the range [0, n_bans).
dtype (str, optional): data type of the output.
Returns:
Tensor: the dequantized tensor, dtype is specified by dtype.
Parameters
-----------
quantized : Tensor [dtype: int]
The quantized value in the range [0, n_bands).
n_bands : int
Number of bands. The input integer Tensor's value is in the range
[0, n_bans).
dtype : str, optional
Data type of the output.
Returns
-----------
Tensor
The dequantized tensor, dtype is specified by `dtype`. If `dtype` is
not specified, the default float data type is used.
"""
dtype = dtype or paddle.get_default_dtype()
value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0
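A quick round-trip sketch of the two functions above (standalone, assuming the import path below):

import paddle
from parakeet.modules.audio import quantize, dequantize

x = paddle.to_tensor([-1.0, -0.5, 0.0, 0.5, 0.999])
q = quantize(x, n_bands=256)           # int64 values in [0, 256)
x_hat = dequantize(q, n_bands=256)     # floats back in [-1, 1), error bounded by 1 / 256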
@ -37,15 +57,36 @@ def dequantize(quantized, n_bands, dtype=None):
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hanning".
Notes
-----------
It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
details.
Given an audio with ``T`` samples, the STFT transformation outputs a
spectrum of shape (C, frames) with complex dtype, where
``C = 1 + n_fft // 2`` and ``frames = 1 + T // hop_length``.
Only ``center`` and ``reflect`` padding is supported now.
"""
def __init__(self, n_fft, hop_length, win_length, window="hanning"):
super(STFT, self).__init__()
self.hop_length = hop_length
self.n_bin = 1 + n_fft // 2
@ -73,13 +114,18 @@ class STFT(nn.Layer):
def forward(self, x):
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns
------------
real : Tensor [shape=(B, C, 1, frames)]
The real part of the spectrogram.
imag : Tensor [shape=(B, C, 1, frames)]
The imaginary part of the spectrogram.
"""
# x(batch_size, time_steps)
# pad it first with reflect mode
@ -95,30 +141,34 @@ class STFT(nn.Layer):
return real, imag
def power(self, x):
"""Compute the power spectrogram.
"""Compute the power spectrum.
Args:
(real, imag)
real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns:
Variable: shape(B, C, 1, T), dtype flaot32, the power spectrogram.
Returns
------------
Tensor [shape=(B, C, 1, T)]
The power spectrum.
"""
real, imag = self(x)
power = real**2 + imag**2
return power
def magnitude(self, x):
"""Compute the magnitude spectrogram.
"""Compute the magnitude of the spectrum.
Args:
(real, imag)
real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns:
Variable: shape(B, C, 1, T), dtype flaot32, the magnitude spectrogram. It is the square root of the power spectrogram.
Returns
------------
Tensor [shape=(B, C, 1, T)]
The magnitude of the spectrum.
"""
power = self.power(x)
magnitude = paddle.sqrt(power)
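A minimal usage sketch of the STFT module (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.audio import STFT

stft = STFT(n_fft=1024, hop_length=256, win_length=1024, window="hanning")
wav = paddle.randn([1, 22050])   # (B, T), a dummy one-second waveform
real, imag = stft(wav)           # each of shape (B, 1 + n_fft // 2, 1, frames)
mag = stft.magnitude(wav)        # magnitude of the spectrum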

View File

@ -1,90 +0,0 @@
import math
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.modules.conv import Conv1dBatchNorm
class Highway(nn.Layer):
def __init__(self, num_features):
super(Highway, self).__init__()
self.H = nn.Linear(num_features, num_features)
self.T = nn.Linear(num_features, num_features,
bias_attr=I.Constant(-1.))
self.num_features = num_features
def forward(self, x):
H = F.relu(self.H(x))
T = F.sigmoid(self.T(x)) # gate
return H * T + x * (1.0 - T)
class CBHG(nn.Layer):
def __init__(self, in_channels, out_channels_per_conv, max_kernel_size,
projection_channels,
num_highways, highway_features,
gru_features):
super(CBHG, self).__init__()
self.conv1d_banks = nn.LayerList(
[Conv1dBatchNorm(in_channels, out_channels_per_conv, (k,),
padding=((k - 1) // 2, k // 2))
for k in range(1, 1 + max_kernel_size)])
self.projections = nn.LayerList()
projection_channels = list(projection_channels)
proj_in_channels = [max_kernel_size *
out_channels_per_conv] + projection_channels
proj_out_channels = projection_channels + \
[in_channels] # ensure residual connection
for c_in, c_out in zip(proj_in_channels, proj_out_channels):
conv = nn.Conv1D(c_in, c_out, (3,), padding=(1, 1))
self.projections.append(conv)
if in_channels != highway_features:
self.pre_highway = nn.Linear(in_channels, highway_features)
self.highways = nn.LayerList(
[Highway(highway_features) for _ in range(num_highways)])
self.gru = nn.GRU(highway_features, gru_features,
direction="bidirectional")
self.in_channels = in_channels
self.out_channels_per_conv = out_channels_per_conv
self.max_kernel_size = max_kernel_size
self.num_projections = 1 + len(projection_channels)
self.num_highways = num_highways
self.highway_features = highway_features
self.gru_features = gru_features
def forward(self, x):
input = x
# conv banks
conv_outputs = []
for conv in self.conv1d_banks:
conv_outputs.append(conv(x))
x = F.relu(paddle.concat(conv_outputs, 1))
# max pool
x = F.max_pool1d(x, 2, stride=1, padding=(0, 1))
# conv1d projections
n_projections = len(self.projections)
for i, conv in enumerate(self.projections):
x = conv(x)
if i != n_projections:
x = F.relu(x)
x += input # residual connection
# highway
x = paddle.transpose(x, [0, 2, 1])
if hasattr(self, "pre_highway"):
x = self.pre_highway(x)
# gru
x, _ = self.gru(x)
return x

View File

@ -1,62 +0,0 @@
import paddle
from paddle import nn
from paddle.nn import functional as F
def residual_connection(input, layer):
"""residual connection, only used for single input-single output layer.
y = x + F(x) where F corresponds to the layer.
Args:
x (Tensor): the input tensor.
layer (callable): a callable that preserve tensor shape.
"""
return input + layer(input)
class ResidualWrapper(nn.Layer):
def __init__(self, layer):
super(ResidualWrapper, self).__init__()
self.layer = layer
def forward(self, x):
return residual_connection(x, self.layer)
class PreLayerNormWrapper(nn.Layer):
def __init__(self, layer, d_model):
super(PreLayerNormWrapper, self).__init__()
self.layer = layer
self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
def forward(self, x):
return x + self.layer(self.layer_norm(x))
class PostLayerNormWrapper(nn.Layer):
def __init__(self, layer, d_model):
super(PostLayerNormWrapper, self).__init__()
self.layer = layer
self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
def forward(self, x):
return self.layer_norm(x + self.layer(x))
def context_gate(input, axis):
"""sigmoid gate the content by gate.
Args:
input (Tensor): shape(*, d_axis, *), the input, treated as content & gate.
axis (int): the axis to chunk content and gate.
Raises:
ValueError: if input.shape[axis] is not even.
Returns:
Tensor: shape(*, d_axis / 2 , *), the gated content.
"""
size = input.shape[axis]
if size % 2 != 0:
raise ValueError("the size of the {}-th dimension of input should "
"be even, but received {}".format(axis, size))
content, gate = paddle.chunk(input, 2, axis)
return F.sigmoid(gate) * content

View File

@ -15,19 +15,69 @@
import paddle
from paddle import nn
__all__ = [
"Conv1dCell",
"Conv1dBatchNorm",
]
class Conv1dCell(nn.Conv1D):
"""
A subclass of Conv1d layer, which can be used like an RNN cell. It can take
step input and return step output. It is done by keeping an internal buffer,
when adding a step input, we shift the buffer and return a step output. For
single step case, convolution devolves to a linear transformation.
"""A subclass of Conv1D layer, which can be used in an autoregressive
decoder like an RNN cell.
When used in autoregressive decoding, it performs causal temporal
convolution incrementally. At each time step, it takes a step input and
returns a step output.
Notes
------
It is done by caching an internal buffer of length ``receptive_field - 1``.
When adding a step input, the buffer is shifted by one step, the latest
input is appended to the buffer and the oldest step is discarded. And it
returns a step output. For the single step case, convolution is equivalent
to a linear transformation.
That it can be used as a cell depends on several restrictions:
1. stride must be 1;
2. padding must be a causal padding (receptive_field - 1, 0).
Thus, these arguments are removed from the ``__init__`` method of this
class.
Parameters
----------
in_channels: int
The feature size of the input.
out_channels: int
The feature size of the output.
kernel_size: int or Tuple[int]
The size of the kernel.
dilation: int or Tuple[int]
The dilation of the convolution, by default 1
weight_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias. If ``False``, this layer does not
have a bias, by default None.
Examples
--------
>>> cell = Conv1dCell(3, 4, kernel_size=5)
>>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
>>> outputs = []
>>> cell.eval()
>>> cell.start_sequence()
>>> for xt in inputs:
>>> outputs.append(cell.add_input(xt))
>>> len(outputs)
16
>>> outputs[0].shape
[4, 4]
"""
def __init__(self,
@ -54,9 +104,23 @@ class Conv1dCell(nn.Conv1D):
@property
def receptive_field(self):
"""The receptive field of the Conv1dCell.
"""
return self._r
def start_sequence(self):
"""Prepare the layer for a series of incremental forward.
Warnings
---------
This method should be called before a sequence of calls to
``add_input``.
Raises
------
Exception
If this method is called when the layer is in training mode.
"""
if self.training:
raise Exception("only use start_sequence in evaluation")
self._buffer = None
@ -72,21 +136,41 @@ class Conv1dCell(nn.Conv1D):
(self._out_channels, -1))
def initialize_buffer(self, x_t):
"""Initialize the buffer for the step input.
Parameters
----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
"""
batch_size, _ = x_t.shape
self._buffer = paddle.zeros(
(batch_size, self._in_channels, self.receptive_field),
dtype=x_t.dtype)
def update_buffer(self, x_t):
"""Shift the buffer by one step.
Parameters
----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
"""
self._buffer = paddle.concat(
[self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
def add_input(self, x_t):
"""
Arguments:
x_t (Tensor): shape (batch_size, in_channels), step input.
Rerurns:
y_t (Tensor): shape (batch_size, out_channels), step output.
"""Add step input and compute step output.
Parameters
-----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
Returns
-------
y_t :Tensor [shape=(batch_size, out_channels)]
The step output.
"""
batch_size = x_t.shape[0]
if self.receptive_field > 1:
@ -108,6 +192,45 @@ class Conv1dCell(nn.Conv1D):
class Conv1dBatchNorm(nn.Layer):
"""A Conv1D Layer followed by a BatchNorm1D.
Parameters
----------
in_channels : int
The feature size of the input.
out_channels : int
The feature size of the output.
kernel_size : int
The size of the convolution kernel.
stride : int, optional
The stride of the convolution, by default 1.
padding : int, str or Tuple[int], optional
The padding of the convolution.
If int, a symmetrical padding is applied before convolution;
If str, it should be "same" or "valid";
If Tuple[int], its length should be 2, meaning
``(pad_before, pad_after)``, by default 0.
weight_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias of the convolution, by default
None.
data_format : str ["NCL" or "NLC"], optional
The data layout of the input, by default "NCL"
momentum : float, optional
The momentum of the BatchNorm1D layer, by default 0.9
epsilon : float, optional
The epsilon of the BatchNorm1D layer, by default 1e-05.
"""
def __init__(self,
in_channels,
out_channels,
@ -136,6 +259,18 @@ class Conv1dBatchNorm(nn.Layer):
data_format=data_format)
def forward(self, x):
"""Forward pass of the Conv1dBatchNorm layer.
Parameters
----------
x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
The input tensor. Its data layout depends on ``data_format``.
Returns
-------
Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
The output tensor.
"""
x = self.conv(x)
x = self.bn(x)
return x
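A small usage sketch of this layer (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.conv import Conv1dBatchNorm

layer = Conv1dBatchNorm(in_channels=80, out_channels=256, kernel_size=3, padding=1)
x = paddle.randn([4, 80, 100])   # (B, C_in, T) with the default "NCL" layout
y = layer(x)                     # (B, C_out, T)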

View File

@ -4,16 +4,25 @@ import paddle
def shuffle_dim(x, axis, perm=None):
"""Permute input tensor along aixs given the permutation or randomly.
Args:
x (Tensor): shape(*, d_{axis}, *), the input tensor.
axis (int): the axis to shuffle.
perm (list[int], ndarray, optional): a permutation of [0, d_{axis}),
the order to reorder the tensor along the `axis`-th dimension, if
not provided, randomly shuffle the `axis`-th dimension. Defaults to
None.
Parameters
----------
x : Tensor
The input tensor.
Returns:
Tensor: the shuffled tensor, it has the same shape as x does.
axis : int
The axis to shuffle.
perm : List[int], ndarray, optional
The order to reorder the tensor along the ``axis``-th dimension.
It is a permutation of ``[0, d)``, where d is the size of the
``axis``-th dimension of the input tensor. If not provided,
a random permutation is used. Defaults to None.
Returns
---------
Tensor
The shuffled tensor, which has the same shape as x does.
"""
size = x.shape[axis]
if perm is not None and len(perm) != size:

View File

@ -4,29 +4,128 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
__all__ = [
"weighted_mean",
"masked_l1_loss",
"masked_softmax_with_cross_entropy",
"diagonal_loss",
]
def weighted_mean(input, weight):
"""weighted mean.(It can also be used as masked mean.)
"""Weighted mean. It can also be used as masked mean.
Args:
input (Tensor): input tensor, floating point dtype.
weight (Tensor): weight tensor with broadcastable shape.
Parameters
-----------
input : Tensor
The input tensor.
weight : Tensor
The weight tensor with broadcastable shape with the input.
Returns:
Tensor: shape(1,), weighted mean tensor with the same dtype as input.
Returns
----------
Tensor [shape=(1,)]
Weighted mean tensor with the same dtype as input.
Warnings
---------
This is not a mathematical weighted mean. It performs weighted sum and
simple average.
"""
weight = paddle.cast(weight, input.dtype)
return paddle.mean(input * weight)
def masked_l1_loss(prediction, target, mask):
"""Compute maksed L1 loss.
Parameters
----------
prediction : Tensor
The prediction.
target : Tensor
The target. The shape should be broadcastable to ``prediction``.
mask : Tensor
The mask. The shape should be broadcastable to the broadcasted shape of
``prediction`` and ``target``.
Returns
-------
Tensor [shape=(1,)]
The masked L1 loss.
"""
abs_error = F.l1_loss(prediction, target, reduction='none')
loss = weighted_mean(abs_error, mask)
return loss
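A short usage sketch of the masked loss (standalone; shapes are arbitrary and the mask zeroes out the last three frames):

import paddle
from parakeet.modules.losses import masked_l1_loss

pred = paddle.randn([2, 10, 80])
target = paddle.randn([2, 10, 80])
mask = paddle.concat([paddle.ones([2, 7, 1]), paddle.zeros([2, 3, 1])], axis=1)
loss = masked_l1_loss(pred, target, mask)   # a scalar Tensor of shape (1,)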
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
"""Compute masked softmax with cross entropy loss.
Parameters
----------
logits : Tensor
The logits. The ``axis``-th axis is the class dimension.
label : Tensor [dtype: int]
The label. The size of the ``axis``-th axis should be 1.
mask : Tensor
The mask. The shape should be broadcastable to ``label``.
axis : int, optional
The index of the class dimension in the shape of ``logits``, by default
-1.
Returns
-------
Tensor [shape=(1,)]
The masked softmax with cross entropy loss.
"""
ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
loss = weighted_mean(ce, mask)
return loss
def diagonal_loss(
attentions,
input_lengths,
target_lengths,
g=0.2,
multihead=False):
"""A metric to evaluate how diagonal a attention distribution is.
It is computed for batch attention distributions. For each attention
distribution, the valid decoder time steps and encoder time steps may
differ.
Parameters
----------
attentions : Tensor [shape=(B, T_dec, T_enc) or (B, H, T_dec, T_enc)]
The attention weights from an encoder-decoder structure.
input_lengths : Tensor [shape=(B,)]
The valid length for each encoder output.
target_lengths : Tensor [shape=(B,)]
The valid length for each decoder output.
g : float, optional
Width parameter of the diagonal guide (passed to ``guided_attentions``),
by default 0.2.
multihead : bool, optional
A flag indicating whether ``attentions`` is a multihead attention's
attention distribution.
If ``True``, the shape of attention is ``(B, H, T_dec, T_enc)``, by
default False.
Returns
-------
Tensor [shape=(1,)]
The diagonal loss.
"""
W = guided_attentions(input_lengths, target_lengths, g)
W_tensor = paddle.to_tensor(W)
if not multihead:

View File

@ -1,32 +1,114 @@
import paddle
from paddle.fluid.layers import sequence_mask
__all__ = [
"id_mask",
"feature_mask",
"combine_mask",
"future_mask",
]
def id_mask(input, padding_index=0, dtype="bool"):
"""Generate mask with input ids.
Those positions where the value equals ``padding_index`` correspond to 0 or
``False``, otherwise, 1 or ``True``.
Parameters
----------
input : Tensor [dtype: int]
The input tensor. It represents the ids.
padding_index : int, optional
The id which represents padding, by default 0.
dtype : str, optional
Data type of the returned mask, by default "bool".
Returns
-------
Tensor
The generated mask. It has the same shape as ``input`` does.
"""
return paddle.cast(input != padding_index, dtype)
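For instance (a standalone sketch assuming the import path below):

import paddle
from parakeet.modules.masking import id_mask

ids = paddle.to_tensor([[3, 5, 7, 0, 0], [2, 4, 0, 0, 0]])  # 0 is the padding id
mask = id_mask(ids)  # bool mask, False at the padding positions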
def feature_mask(input, axis, dtype="bool"):
"""Compute mask from input features.
For input features represented as batched feature vectors, those vectors
which are all zeros are considered padding vectors.
Parameters
----------
input : Tensor [dtype: float]
The input tensor which represents features.
axis : int
The index of the feature dimension in ``input``. Other dimensions are
considered ``spatial`` dimensions.
dtype : str, optional
Data type of the generated mask, by default "bool"
Returns
-------
Tensor
The generated mask with ``spatial`` shape as mentioned above.
It has one less dimension than ``input`` does.
"""
feature_sum = paddle.sum(paddle.abs(input), axis)
return paddle.cast(feature_sum != 0, dtype)
def combine_mask(mask1, mask2):
"""Combine two masks with multiplication or logical and.
Parameters
-----------
mask1 : Tensor
The first mask.
mask2 : Tensor
The second mask with broadcastable shape with ``mask1``.
Returns
--------
Tensor
Combined mask.
Notes
------
It is mainly used to combine the padding mask and no future mask for
transformer decoder.
Padding mask is used to mask padding positions of the decoder inputs and
no future mask is used to prevent the decoder from seeing future
information.
"""
# TODO: to support boolean mask by using logical_and?
if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL:
return paddle.logical_and(mask1, mask2)
else:
return mask1 * mask2
def future_mask(time_steps, dtype="bool"):
"""Generate lower triangular mask.
It is used at the transformer decoder to prevent the decoder from seeing
future information.
Parameters
----------
time_steps : int
Decoder time steps.
dtype : str, optional
The data type of the generated mask, by default "bool".
Returns
-------
Tensor
The generated mask.
"""
mask = paddle.tril(paddle.ones([time_steps, time_steps]))
return paddle.cast(mask, dtype)
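Putting the masking helpers together for a decoder (a standalone sketch; the unsqueeze follows the broadcasting pattern used elsewhere in this diff):

import paddle
from parakeet.modules.masking import id_mask, future_mask, combine_mask

ids = paddle.to_tensor([[5, 6, 7, 0]])        # decoder input ids, 0 = padding
padding_mask = id_mask(ids, dtype="float32")  # (B, T)
no_future = future_mask(4, dtype="float32")   # (T, T), lower triangular
decoder_mask = combine_mask(padding_mask.unsqueeze(1), no_future)  # (B, T, T)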

View File

@ -3,21 +3,34 @@ import numpy as np
import paddle
from paddle.nn import functional as F
__all__ = ["positional_encoding"]
def positional_encoding(start_index, length, size, dtype=None):
"""
Generate standard positional encoding.
r"""Generate standard positional encoding matrix.
pe(pos, 2i) = sin(pos / 10000 ** (2i / size))
pe(pos, 2i+1) = cos(pos / 10000 ** (2i / size))
.. math::
Args:
start_index (int): the start index.
length (int): the length of the positional encoding.
size (int): positional encoding dimension.
pe(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{size}}}) \\
pe(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{size}}})
Returns:
encodings (Tensor): shape(length, size), the positional encoding.
Parameters
----------
start_index : int
The start index.
length : int
The timesteps of the positional encoding to generate.
size : int
Feature size of positional encoding.
Returns
-------
Tensor [shape=(length, size)]
The positional encoding.
Raises
------
ValueError
If ``size`` is not divisible by 2.
"""
if (size % 2 != 0):
raise ValueError("size should be divisible by 2")

View File

@ -5,23 +5,35 @@ from paddle.nn import functional as F
from parakeet.modules import attention as attn
from parakeet.modules.masking import combine_mask
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
"TransformerDecoderLayer",
]
class PositionwiseFFN(nn.Layer):
"""
A faithful implementation of Position-wise Feed-Forward Network
"""A faithful implementation of Position-wise Feed-Forward Network
in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
It is basically a 3-layer MLP, with relu actication and dropout in between.
It is basically a 2-layer MLP, with relu actication and dropout in between.
Parameters
----------
input_size: int
The feature size of the input. It is also the feature size of the
output.
hidden_size: int
The hidden size.
dropout: float
The probability of the Dropout applied to the output of the first
layer, by default 0.
"""
def __init__(self,
input_size: int,
hidden_size: int,
dropout=0.0):
"""
Args:
input_size (int): the input feature size.
hidden_size (int): the hidden layer's feature size.
dropout (float, optional): probability of dropout applied to the
output of the first fully connected layer. Defaults to 0.0.
"""
super(PositionwiseFFN, self).__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, input_size)
@ -31,13 +43,17 @@ class PositionwiseFFN(nn.Layer):
self.hidden_szie = hidden_size
def forward(self, x):
"""positionwise feed forward network.
r"""Forward pass of positionwise feed forward network.
Args:
x (Tensor): shape(*, input_size), the input tensor.
Parameters
----------
x : Tensor [shape=(\*, input_size)]
The input tensor, where ``\*`` means arbitary shape.
Returns:
Tensor: shape(*, input_size), the output tensor.
Returns
-------
Tensor [shape=(\*, input_size)]
The output tensor.
"""
l1 = self.dropout(F.relu(self.linear1(x)))
l2 = self.linear2(l1)
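A small usage sketch of this module (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.transformer import PositionwiseFFN

ffn = PositionwiseFFN(input_size=256, hidden_size=1024, dropout=0.1)
x = paddle.randn([2, 50, 256])   # (*, input_size)
y = ffn(x)                       # same shape as x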
@ -45,18 +61,32 @@ class PositionwiseFFN(nn.Layer):
class TransformerEncoderLayer(nn.Layer):
"""
Transformer encoder layer.
"""A faithful implementation of Transformer encoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model :int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of self attention (a ``MultiheadAttention``
layer).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
"""
Args:
d_model (int): the feature size of the input, and the output.
n_heads (int): the number of heads in the internal MultiHeadAttention layer.
d_ffn (int): the hidden size of the internal PositionwiseFFN.
dropout (float, optional): the probability of the dropout in
MultiHeadAttention and PositionwiseFFN. Defaults to 0.
"""
super(TransformerEncoderLayer, self).__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
@ -64,37 +94,68 @@ class TransformerEncoderLayer(nn.Layer):
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, x, mask):
"""
Args:
x (Tensor): shape(batch_size, time_steps, d_model), the decoder input.
mask (Tensor): shape(batch_size, time_steps), the padding mask.
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The input.
mask : Tensor
The padding mask. The shape is (batch_size, time_steps,
time_steps) or broadcastable shape.
Returns
-------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The encoded output.
attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
The attention weights of the self attention.
"""
context_vector, attn_weights = self.self_mha(x, x, x, mask)
x = self.layer_norm1(
F.dropout(x + context_vector,
self.dropout,
training=self.training))
x = self.layer_norm2(
F.dropout(x + self.ffn(x),
self.dropout,
training=self.training))
return x, attn_weights
class TransformerDecoderLayer(nn.Layer):
"""
Transformer decoder layer.
"""A faithful implementation of Transformer decoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model :int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of attentions (``MultiheadAttention``
layers).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
"""
Args:
d_model (int): the feature size of the input, and the output.
n_heads (int): the number of heads in the internal MultiHeadAttention layer.
d_ffn (int): the hidden size of the internal PositionwiseFFN.
dropout (float, optional): the probability of the dropout in
MultiHeadAttention and PositionwiseFFN. Defaults to 0.
"""
super(TransformerDecoderLayer, self).__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
@ -105,29 +166,51 @@ class TransformerDecoderLayer(nn.Layer):
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, q, k, v, encoder_mask, decoder_mask):
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder input.
k : Tensor [shape=(batch_size, time_steps_k, d_model)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, d_model)]
The values.
encoder_mask : Tensor
Encoder padding mask, shape is ``(batch_size, time_steps_k,
time_steps_k)`` or broadcastable shape.
decoder_mask : Tensor
Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
or broadcastable shape.
Returns
--------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder output.
self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
Decoder self attention.
cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
Decoder-encoder cross attention.
"""
context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
q = self.layer_norm1(
F.dropout(q + context_vector,
self.dropout,
training=self.training))
context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
q = self.layer_norm2(
F.dropout(q + context_vector,
self.dropout,
training=self.training))
q = self.layer_norm3(
F.dropout(q + self.ffn(q),
self.dropout,
training=self.training))
return q, self_attn_weights, cross_attn_weights

View File

@ -0,0 +1,2 @@
from parakeet.training.cli import *
from parakeet.training.experiment import *

View File

@ -1,12 +1,40 @@
import argparse
def default_argument_parser():
r"""A simple yet genral argument parser for experiments with parakeet.
This is used in examples with parakeet. And it is intended to be used by
other experiments with parakeet. It requires a minimal set of command line
arguments to start a training script.
The ``--config`` and ``--opts`` are used for overwrite the deault
configuration.
The ``--data`` and ``--output`` specifies the data path and output path.
Resuming training from existing progress at the output directory is the
intended default behavior.
The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--device`` and ``--nprocs`` specifies how to run the training.
See Also
--------
parakeet.training.experiment
Returns
-------
argparse.ArgumentParser
The parser.
"""
parser = argparse.ArgumentParser()
# yapf: disable
# data and output
parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and log. If not provided, a directory is created in runs/ to save outputs.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
@ -17,5 +45,6 @@ def default_argument_parser():
# overwrite extra config and default config
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
# yapf: enable
return parser
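A minimal usage sketch of the parser (standalone; the paths here are placeholders):

from parakeet.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args(["--data", "path/to/dataset", "--output", "runs/exp0"])
print(args.output)   # runs/exp0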

View File

@ -25,49 +25,67 @@ from collections import defaultdict
import parakeet
from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"]
class ExperimentBase(object):
"""
An experiment template in order to structure the training code and take
care of saving, loading, logging, visualization stuffs. It's intended to
be flexible and simple.
So it only handles output directory (create directory for the output,
create a checkpoint directory, dump the config in use and create
visualizer and logger) in a standard way without enforcing any
input-output protocols to the model and dataloader. It leaves the main
part for the user to implement their own (setup the model, criterion,
optimizer, define a training step, define a validation function and
customize all the text and visual logs).
It does not save too much boilerplate code. The users still have to write
the forward/backward/update manually, but they are free to add
non-standard behaviors if needed.
We have some conventions to follow.
1. Experiment should have ``model``, ``optimizer``, ``train_loader`` and
``valid_loader``, ``config`` and ``args`` attributes.
2. The config should have a ``training`` field, which has
``valid_interval``, ``save_interval`` and ``max_iteration`` keys. It is
used as the trigger to invoke validation, checkpointing and stop of the
experiment.
3. There are four methods, namely ``train_batch``, ``valid``,
``setup_model`` and ``setup_dataloader`` that should be implemented.
Feel free to add/overwrite other methods and standalone functions if you
need.
Parameters
----------
config: yacs.config.CfgNode
The configuration used for the experiment.
args: argparse.Namespace
The parsed command line arguments.
Examples
--------
>>> def main_sp(config, args):
>>> exp = Experiment(config, args)
>>> exp.setup()
>>> exp.run()
>>>
>>> config = get_cfg_defaults()
>>> parser = default_argument_parser()
>>> args = parser.parse_args()
>>> if args.config:
>>> config.merge_from_file(args.config)
>>> if args.opts:
>>> config.merge_from_list(args.opts)
>>> config.freeze()
>>>
>>> if args.nprocs > 1 and args.device == "gpu":
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else:
>>> main_sp(config, args)
"""
def __init__(self, config, args):
@ -75,6 +93,8 @@ class ExperimentBase(object):
self.args = args
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
if self.parallel:
self.init_parallel()
@ -93,16 +113,29 @@ class ExperimentBase(object):
@property
def parallel(self):
"""A flag indicating whether the experiment should run with
multiprocessing.
"""
return self.args.device == "gpu" and self.args.nprocs > 1
def init_parallel(self):
"""Init environment for multiprocess training.
"""
dist.init_parallel_env()
def save(self):
"""Save checkpoint (model parameters and optimizer states).
"""
checkpoint.save_parameters(self.checkpoint_dir, self.iteration,
self.model, self.optimizer)
def resume_or_load(self):
"""Resume from latest checkpoint at checkpoints in the output
directory or load a specified checkpoint.
If ``args.checkpoint_path`` is not None, load the checkpoint, else
resume training.
"""
iteration = checkpoint.load_parameters(
self.model,
self.optimizer,
@ -111,6 +144,13 @@ class ExperimentBase(object):
self.iteration = iteration
def read_batch(self):
"""Read a batch from the train_loader.
Returns
-------
List[Tensor]
A batch.
"""
try:
batch = next(self.iterator)
except StopIteration:
@ -119,12 +159,19 @@ class ExperimentBase(object):
return batch
def new_epoch(self):
"""Reset the train loader and increment ``epoch``.
"""
self.epoch += 1
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
self.iterator = iter(self.train_loader)
def train(self):
"""The training process.
It includes forward/backward/update and periodical validation and
saving.
"""
self.new_epoch()
while self.iteration < self.config.training.max_iteration:
self.iteration += 1
@ -137,6 +184,9 @@ class ExperimentBase(object):
self.save()
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.resume_or_load()
try:
self.train()
@ -146,6 +196,8 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_output_dir(self):
"""Create a directory used for output.
"""
# output dir
output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
@ -154,6 +206,10 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_checkpointer(self):
"""Create a directory used to save checkpoints into.
It is "checkpoints" inside the output directory.
"""
# checkpoint dir
checkpoint_dir = self.output_dir / "checkpoints"
checkpoint_dir.mkdir(exist_ok=True)
@ -162,12 +218,28 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_visualizer(self):
"""Initialize a visualizer to log the experiment.
The visual log is saved in the output directory.
Notes
------
Only the main process has a visualizer with it. Using multiple
visualizers in multiple processes to write to the same log file may cause
unexpected behaviors.
"""
# visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir))
self.visualizer = visualizer
def setup_logger(self):
"""Initialize a text logger to log the experiment.
Each process has its own text logger. The logging messages are written to
the standard output and to a text file named ``worker_n.log`` in the
output directory, where ``n`` means the rank of the process.
"""
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
logger.addHandler(logging.StreamHandler())
@ -178,19 +250,34 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def dump_config(self):
"""Save the configuration used for this experiment.
It is saved to ``config.yaml`` in the output directory at the
beginning of the experiment.
"""
with open(self.output_dir / "config.yaml", 'wt') as f:
print(self.config, file=f)
def train_batch(self):
"""The training loop. A subclass should implement this method.
"""
raise NotImplementedError("train_batch should be implemented.")
@mp_tools.rank_zero_only
@paddle.no_grad()
def valid(self):
"""The validation. A subclass should implement this method.
"""
raise NotImplementedError("valid should be implemented.")
def setup_model(self):
"""Setup model, criterion and optimizer, etc. A subclass should
implement this method.
"""
raise NotImplementedError("setup_model should be implemented.")
def setup_dataloader(self):
"""Setup training dataloader and validation dataloader. A subclass
should implement this method.
"""
raise NotImplementedError("setup_dataloader should be implemented.")

View File

@ -56,15 +56,14 @@ setup_info = dict(
'unidecode',
'numba==0.47.0',
'tqdm==4.19.8',
'llvmlite==0.31.0',
'matplotlib',
'visualdl>=2.0.1',
'scipy',
'ruamel.yaml',
'pandas',
'sox',
'soundfile',
'llvmlite==0.31.0',
'opencc',
'soundfile',
'g2p_en',
'g2pM',
'yacs',