Merge pull request #64 from PaddlePaddle/doc

Update docstrings
Feiyu Chan 2020-12-18 20:58:59 +08:00 committed by GitHub
commit badf72d611
43 changed files with 2606 additions and 619 deletions

20
doc/Makefile Normal file

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

35
doc/make.bat Normal file

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

64
doc/source/conf.py Normal file

@ -0,0 +1,64 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'parakeet'
copyright = '2020, parakeet-developers'
author = 'parakeet-developers'
# The full version, including alpha/beta/rc tags
release = '0.2'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
"sphinx_rtd_theme",
'sphinx.ext.mathjax',
'numpydoc',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
source_suffix = ['.rst', '.md']
# -- Extension configuration -------------------------------------------------
numpydoc_show_class_members = False

20
doc/source/index.rst Normal file

@ -0,0 +1,20 @@
.. parakeet documentation master file, created by
sphinx-quickstart on Thu Dec 17 20:01:34 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to parakeet's documentation!
====================================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

7
doc/source/modules.rst Normal file

@ -0,0 +1,7 @@
parakeet
========
.. toctree::
:maxdepth: 4
parakeet


@ -0,0 +1,29 @@
parakeet.audio package
======================
Submodules
----------
parakeet.audio.audio module
---------------------------
.. automodule:: parakeet.audio.audio
:members:
:undoc-members:
:show-inheritance:
parakeet.audio.spec\_normalizer module
--------------------------------------
.. automodule:: parakeet.audio.spec_normalizer
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.audio
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,29 @@
parakeet.data package
=====================
Submodules
----------
parakeet.data.batch module
--------------------------
.. automodule:: parakeet.data.batch
:members:
:undoc-members:
:show-inheritance:
parakeet.data.dataset module
----------------------------
.. automodule:: parakeet.data.dataset
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.data
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,29 @@
parakeet.datasets package
=========================
Submodules
----------
parakeet.datasets.common module
-------------------------------
.. automodule:: parakeet.datasets.common
:members:
:undoc-members:
:show-inheritance:
parakeet.datasets.ljspeech module
---------------------------------
.. automodule:: parakeet.datasets.ljspeech
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.datasets
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,37 @@
parakeet.frontend package
=========================
Submodules
----------
parakeet.frontend.phonectic module
----------------------------------
.. automodule:: parakeet.frontend.phonectic
:members:
:undoc-members:
:show-inheritance:
parakeet.frontend.punctuation module
------------------------------------
.. automodule:: parakeet.frontend.punctuation
:members:
:undoc-members:
:show-inheritance:
parakeet.frontend.vocab module
------------------------------
.. automodule:: parakeet.frontend.vocab
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.frontend
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,45 @@
parakeet.models package
=======================
Submodules
----------
parakeet.models.tacotron2 module
--------------------------------
.. automodule:: parakeet.models.tacotron2
:members:
:undoc-members:
:show-inheritance:
parakeet.models.transformer\_tts module
---------------------------------------
.. automodule:: parakeet.models.transformer_tts
:members:
:undoc-members:
:show-inheritance:
parakeet.models.waveflow module
-------------------------------
.. automodule:: parakeet.models.waveflow
:members:
:undoc-members:
:show-inheritance:
parakeet.models.wavenet module
------------------------------
.. automodule:: parakeet.models.wavenet
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.models
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,77 @@
parakeet.modules package
========================
Submodules
----------
parakeet.modules.attention module
---------------------------------
.. automodule:: parakeet.modules.attention
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.audio module
-----------------------------
.. automodule:: parakeet.modules.audio
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.conv module
----------------------------
.. automodule:: parakeet.modules.conv
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.geometry module
--------------------------------
.. automodule:: parakeet.modules.geometry
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.losses module
------------------------------
.. automodule:: parakeet.modules.losses
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.masking module
-------------------------------
.. automodule:: parakeet.modules.masking
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.positional\_encoding module
--------------------------------------------
.. automodule:: parakeet.modules.positional_encoding
:members:
:undoc-members:
:show-inheritance:
parakeet.modules.transformer module
-----------------------------------
.. automodule:: parakeet.modules.transformer
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.modules
:members:
:undoc-members:
:show-inheritance:

25
doc/source/parakeet.rst Normal file

@ -0,0 +1,25 @@
parakeet package
================
Subpackages
-----------
.. toctree::
:maxdepth: 4
parakeet.audio
parakeet.data
parakeet.datasets
parakeet.frontend
parakeet.models
parakeet.modules
parakeet.training
parakeet.utils
Module contents
---------------
.. automodule:: parakeet
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,37 @@
parakeet.training package
=========================
Submodules
----------
parakeet.training.cli module
----------------------------
.. automodule:: parakeet.training.cli
:members:
:undoc-members:
:show-inheritance:
parakeet.training.default\_config module
----------------------------------------
.. automodule:: parakeet.training.default_config
:members:
:undoc-members:
:show-inheritance:
parakeet.training.experiment module
-----------------------------------
.. automodule:: parakeet.training.experiment
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.training
:members:
:undoc-members:
:show-inheritance:


@ -0,0 +1,61 @@
parakeet.utils package
======================
Submodules
----------
parakeet.utils.checkpoint module
--------------------------------
.. automodule:: parakeet.utils.checkpoint
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.display module
-----------------------------
.. automodule:: parakeet.utils.display
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.internals module
-------------------------------
.. automodule:: parakeet.utils.internals
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.layer\_tools module
----------------------------------
.. automodule:: parakeet.utils.layer_tools
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.mp\_tools module
-------------------------------
.. automodule:: parakeet.utils.mp_tools
:members:
:undoc-members:
:show-inheritance:
parakeet.utils.scheduler module
-------------------------------
.. automodule:: parakeet.utils.scheduler
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: parakeet.utils
:members:
:undoc-members:
:show-inheritance:

112
docs/config_cn.md Normal file

@ -0,0 +1,112 @@
# Experiment Configuration
This section describes the recommended way to configure experiments in parakeet, and the reasons behind these choices.
## What Goes into a Configuration
Deep learning experiments usually have many configurable options. They roughly fall into several categories:
1. data sources and data processing;
2. paths for saving experiment results;
3. data preprocessing;
4. model architecture and hyperparameters;
5. training process.
These categories may overlap; for example, part of the data preprocessing configuration is related to the model configuration. The number of mel bands, say, can be viewed either as part of the model configuration or as part of the data processing configuration. By and large, though, a configuration file can be divided into these parts.
## Common Configuration File Formats
Common configuration file formats include `ini`, `yaml`, `toml` and `json`.
`ini`
Pros: simple, supports string interpolation and similar features.
Cons: only two levels of structure; values carry no type information and must be cast manually when parsed.
`yaml`
Pros: concise syntax; values are typed, so manual casts are usually unnecessary; comments are supported.
Cons: the language specification is complex.
`toml`
Similar to yaml.
`json`
Pros: simple format.
Cons: too many delimiters, poor readability, error-prone to write by hand; comments are not supported.
Considering expressiveness and readability, we choose yaml, but we keep the configuration files as simple as possible:
1. for value types, we only use strings, integers, floats and booleans;
2. for nesting, we use at most two levels of structure.
## Configuration Options and Command-Line Arguments
In deep learning experiments, some options change frequently, such as the data source, the path for saving experiment results, or the path of the checkpoint to load. These are better implemented as command-line arguments.
The remaining options, which rarely change, are better kept in a configuration file. We recommend `yaml` for this, because it allows comments and is more human-readable.
Handling every option with argparse is of course possible, but for deep learning experiments with many options it makes the code extremely verbose.
Note, however, that when a configuration file and a command-line parser are used together, the options supported by the configuration file do not show up in the usage and help messages of argparse.ArgumentParser unless special handling is added. This is mainly due to inherent design differences between configuration file parsing and argparse.
Attaching the options supported by the configuration to the ArgumentParser can work around this, but it raises the question of whose defaults take precedence: those in the default configuration or those defined in the ArgumentParser.
We therefore choose not to attach the configuration options to the ArgumentParser, and instead handle the two parts separately.
## Practice
We use yacs together with argparse as our configuration tools, adding a `--config` option to the argparse command line to pass in a configuration file. yacs has several useful properties:
1. it supports yaml-format configuration files (i.e. nested structure and typed values);
2. it supports incremental overrides of a config, including overriding the configuration file with command-line arguments;
3. it supports recursive attribute access via `.key`, which is more convenient than dictionary-style `["key"]` access.
We recommend writing the default configuration as Python code (each example under examples has a config.py that provides the default configuration, with comments). When users need to override part of the configuration, they only provide the parts they want to change rather than a complete configuration file. The reasons are:
1. providing only the options to override is also the standard way many kinds of software are configured;
2. between two runs of the same model, usually only a few options change; an incremental configuration makes the differences between the two runs much easier to see than a complete one;
3. the script can be run without the `--config` argument, falling back to the default configuration, which simplifies running scripts.
When adding a new experiment, refer to the examples under examples for how to write the default configuration.
Besides specifying an override configuration file with `--config`, we also add an `--opts` option that receives the remaining command-line arguments parsed by the ArgumentParser. These are used to further override the configuration. The usage is `--opts key1 value1 key2 value2 ...`, i.e. keys and values separated by spaces, for example `--opts training.lr 0.001 model.encoder_layers 4`. The keys are the key names in the configuration; for nested options, key names are joined with `.`. A sketch of such a default configuration and how it gets overridden follows.
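As a minimal sketch (not the exact contents of any particular example), a default config.py built on yacs might look like the following; the key names and default values here are purely illustrative:
```python
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(dict(batch_size=32, d_mels=80))               # data processing options
_C.model = CN(dict(d_encoder=512, encoder_conv_layers=3))  # model hyperparameters
_C.training = CN(dict(lr=1e-3, max_iteration=500000,
                      valid_interval=1000, save_interval=1000))

def get_cfg_defaults():
    # return a fresh copy so that callers can modify it freely
    return _C.clone()
```
With such defaults, `config.merge_from_file(path)` applies the file passed via `--config`, and `config.merge_from_list(["training.lr", "0.001"])` applies the key-value pairs passed via `--opts`.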
## The Default ArgumentParser
We provide a default ArgumentParser (see `parakeet/training/cli.py`) that implements the functionality described above. It contains only a minimal set of command-line options: `--config`, `--data`, `--output`, `--checkpoint_path`, `--device`, `--nprocs` and `--opts`.
These are options almost every deep learning experiment needs, so a new experiment can use this ArgumentParser directly and add further options only when something beyond this set is required.
1. `--config` and `--opts` support configuration file parsing, while the configuration file itself handles the options specific to each experiment;
2. `--data` and `--output` are the dataset path and the path for saving training results (including the checkpoints/ folder, text outputs and visualization outputs), respectively;
3. `--checkpoint_path` loads a checkpoint before training, for resuming training from a specific checkpoint. If `--checkpoint_path` is not given and the checkpoints/ folder under `--output` already contains training results, the latest checkpoint there is loaded by default to resume training;
4. `--device` and `--nprocs` specify how to run: `--device` selects the device type (cpu or gpu), and `--nprocs` is the number of training processes; `nprocs` > 1 means multi-process parallel training. (Note: currently only multi-GPU multi-process training is supported.)
The help message is as follows (a sketch of how a training script combines this parser with the configuration follows after it):
```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
[--nprocs NPROCS] [--opts ...]
optional arguments:
-h, --help show this help message and exit
--config FILE path of the config file to overwrite to default config
with.
--data DATA_DIR path to the datatset.
--output OUTPUT_DIR path to save checkpoint and log. If not provided, a
directory is created in runs/ to save outputs.
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load
--device {cpu,gpu} device type to use, cpu and gpu are supported.
--nprocs NPROCS number of parallel processes to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
```
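Assuming the default parser is exposed as `default_argument_parser` in `parakeet/training/cli.py` and the example provides a `get_cfg_defaults` helper in its config.py (both names are assumptions used here for illustration), a training script's entry point might combine them like this:
```python
import argparse
from yacs.config import CfgNode

from parakeet.training.cli import default_argument_parser  # assumed name
from config import get_cfg_defaults                        # the example's config.py (assumed)

def main(config: CfgNode, args: argparse.Namespace):
    ...  # build the experiment from config/args and run it

if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()

    config = get_cfg_defaults()
    if args.config:                 # --config: override the defaults with a yaml file
        config.merge_from_file(args.config)
    if args.opts:                   # --opts KEY VALUE ...: override both
        config.merge_from_list(args.opts)
    config.freeze()

    main(config, args)
```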

75
docs/experiment_cn.md Normal file

@ -0,0 +1,75 @@
# Experiment Workflow
An experiment involves many details that need attention: saving and loading models, running validation periodically, writing text logs and visual logs, saving the configuration, and extra handling for different run modes. Such code can be tedious to write, but it matters a great deal for tracking how code changes affect results and for debugging. To reduce the cost of writing it, we provide a fair amount of generic helper code, for example for saving, loading and visualization, that experiment code can use directly.
For the experiment process as a whole, we provide an ExperimentBase class, a training-process template abstracted out during model and experiment development, which can be used as the base class for concrete experiments. Compared with the Trainer in chainer or Model.fit in keras, ExperimentBase is a relatively low-level API: it is used as a base class, so the user still implements the whole training process and therefore keeps control over many things, rather than being used compositionally, where the user only supplies the model, datasets and metrics and the whole training process is completed automatically.
The former approach does not save much code; it merely organizes the code in a standardized way. The latter saves a lot of code but hides how the training process is put together. If custom behavior has to be added to the standard training process, it must be implemented through extensions/hooks that are invoked at fixed points (at the start or end of an iteration, of an epoch, or of the whole training run).
Adding custom behavior to the training process through extensions/hooks usually comes with access restrictions. An extension/hook is typically implemented as a callable, but the variables that callable can access are limited, for example model, optimizer, dataloader, iteration, epoch and metrics; accessing other intermediate variables tends to be cumbersome.
Moreover, the compositional style usually presumes some protocol for passing data between components. One common assumption is that the batch produced by the dataloader is exactly the model's input. In simple cases this is mostly fine, but it is also possible that a model needs inputs beyond the batch. Another common assumption is that the criterion can compute the loss from just the model's input and output; this can be overkill, because in some cases not all fields of the input and output are needed to compute the loss, and making the criterion's interface uniform just to satisfy the protocol means passing unnecessary arguments.
## Design of ExperimentBase
We therefore chose a low-level interface: the user still operates the training process freely, and only a coarse-grained abstraction is imposed on it. See the code of [ExperimentBase](parakeet/training/experiment.py).
When subclassing ExperimentBase to write your own experiment class, follow these conventions:
1. it has the attributes `.model`, `.optimizer`, `.train_loader`, `.valid_loader`, `.config` and `.args`;
2. the configuration contains a `.training` field with the keys `valid_interval`, `save_interval` and `max_iteration`, which are used as the conditions for triggering validation, saving checkpoints and stopping training;
3. it implements the four methods `train_batch`, `valid`, `setup_model` and `setup_dataloader`: `train_batch` processes one batch, `valid` runs one validation pass over the whole validation set, `setup_model` builds the model and the optimizer (other model-construction code can also go here), and `setup_dataloader` builds train_loader and valid_loader (a sketch of such a subclass is shown right below).
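A minimal subclass following these conventions might look like the sketch below; `MyModel`, `MyDataset`, `my_batch_fn` and `compute_loss` are placeholders, and `self.logger` is assumed to be prepared by the base class's `setup_logger`:
```python
import paddle
from paddle.io import DataLoader
from parakeet.training.experiment import ExperimentBase

class MyExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        self.model = MyModel(**config.model)           # placeholder model
        self.optimizer = paddle.optimizer.Adam(
            config.training.lr, parameters=self.model.parameters())

    def setup_dataloader(self):
        train_set = MyDataset(self.args.data, split="train")  # placeholder dataset
        valid_set = MyDataset(self.args.data, split="valid")
        self.train_loader = DataLoader(train_set,
                                       batch_size=self.config.data.batch_size,
                                       shuffle=True,
                                       collate_fn=my_batch_fn)
        self.valid_loader = DataLoader(valid_set, batch_size=1,
                                       collate_fn=my_batch_fn)

    def train_batch(self):
        # fetch one batch, cycling over the train_loader
        try:
            batch = next(self._iterator)
        except (AttributeError, StopIteration):
            self._iterator = iter(self.train_loader)
            batch = next(self._iterator)
        loss = self.model.compute_loss(*batch)          # placeholder loss
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()
        self.logger.info(f"iteration {self.iteration}: loss = {float(loss)}")

    @paddle.no_grad()
    def valid(self):
        losses = [float(self.model.compute_loss(*batch))
                  for batch in self.valid_loader]
        self.logger.info(f"validation loss: {sum(losses) / len(losses)}")
```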
The initialization of an experiment is shown below. It creates the model, the optimizer and the data iterators, prepares the output directory, the logger and the visualizer, and saves the configuration. Apart from `setup_dataloader` and `setup_model`, which you implement yourself, the other methods already have standard implementations.
```python
def __init__(self, config, args):
self.config = config
self.args = args
def setup(self):
paddle.set_device(self.args.device)
if self.parallel:
self.init_parallel()
self.setup_output_dir()
self.dump_config()
self.setup_visualizer()
self.setup_logger()
self.setup_checkpointer()
self.setup_dataloader()
self.setup_model()
self.iteration = 0
self.epoch = 0
```
When using it, the following code is enough to set up an experiment:
```python
exp = Experiment(config, args)
exp.setup()
```
The whole training process can be expressed as follows:
```python
def train(self):
self.new_epoch()
while self.iteration < self.config.training.max_iteration:
self.iteration += 1
self.train_batch()
if self.iteration % self.config.training.valid_interval == 0:
self.valid()
if self.iteration % self.config.training.save_interval == 0:
self.save()
```
To start the experiment, just run the following code:
```python
exp.run()
```


@ -37,6 +37,10 @@ Dataset --(transform)--> Dataset --+
When developing a new model, the developer needs to consider how feasible it is to split the code into modules and how general each module is, and place them in the appropriate directories.
## Configuring Experiments
We use yacs and argparse to handle configuration file parsing and command-line argument parsing, respectively. For the recommended way to configure experiments, see [Experiment Configuration](./config_cn.md).
## Training Process
Training generally means running a loop body many times. A typical loop body contains the following steps:
@ -46,34 +50,27 @@ Dataset --(transform)--> Dataset --+
3. the forward/backward computation of the neural network;
4. parameter updates;
5. evaluating the model on the validation set when certain conditions are met;
6. writing logs, visualization, and saving intermediate results;
6. writing logs, visualization, and, in some cases, saving necessary intermediate results;
7. saving the states of the model and the optimizer.
The `data processing` section covers steps 1 and 2, and the model and optimizer cover steps 3 and 4, so steps 5, 6 and 7 are the main work of the training process. To keep the training loop clean and clear, it is recommended to implement model saving and loading, model evaluation, logging and visualization as functions, even though in many cases they need to access many local variables. We are also considering using an Experiment or Trainer class to standardize how such training loops are written, so that variables needed by many functions can become class members, keeping the code concise without introducing too many global variables.
`Data processing` covers the definitions of the dataset and the batch_function, while the model and the optimizer cover the definition of the model's forward/backward computation. Once the model and the data are ready, we need to put them together to complete the experiment code.
For how the training process is assembled, see [Experiment Workflow](./experiment_cn.md).
## Experiment Template
Experiment code is generally organized as follows:
```text
├── configs/ (experiment configurations)
├── data.py (definitions of Dataset, DataLoader, etc.)
├── README.md (help information for the experiment)
├── config.py (default configuration)
├── preprocess.py (data preprocessing script)
├── data.py (definitions of Dataset, batch_function, etc.)
├── synthesis.py (code for synthesis)
├── train.py (code for training)
└── utils.py (other necessary helper functions)
```
## Configuring Experiments
Deep learning experiments usually have many configurable options. They roughly fall into several categories:
1. data sources and data processing;
2. paths for saving experiment results;
3. data preprocessing;
4. model architecture and hyperparameters;
5. training process.
These categories may overlap; for example, part of the data preprocessing configuration is related to the model configuration, such as the number of mel bands.
Some options change frequently, such as the data source, the path for saving experiment results, or the path of the checkpoint to load. These are better implemented as command-line arguments. The remaining, rarely changing options are better kept in a configuration file; we recommend `yaml` because it allows comments and is more human-readable.
This repository contains several examples, which can be found under [Parakeet/examples](../examples). They are provided as samples that users can run directly. Users are also welcome to add new models and experiments and contribute code to `Parakeet`.


@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.2.0"
__version__ = "0.2.0-beta"
from parakeet import audio, data, datasets, frontend, models, modules, training, utils


@ -1,36 +0,0 @@
import parakeet
if __name__ == '__main__':
import argparse
import os
import shutil
from pathlib import Path
package_path = Path(__file__).parent
print(package_path)
parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest="cmd")
list_exp_parser = subparser.add_parser("list-examples")
clone = subparser.add_parser("clone-example")
clone.add_argument("experiment_name", type=str, help="experiment name")
args = parser.parse_args()
if args.cmd == "list-examples":
print(os.listdir(package_path / "examples"))
exit(0)
if args.cmd == "clone-example":
source = package_path / "examples" / (args.experiment_name)
target = Path(os.getcwd()) / (args.experiment_name)
if not os.path.exists(str(source)):
raise ValueError("{} does not exist".format(str(source)))
if os.path.exists(str(target)):
raise FileExistsError("{} already exists".format(str(target)))
shutil.copytree(str(source), str(target))
print("{} copied!".format(args.experiment_name))
exit(0)


@ -19,6 +19,8 @@ from parakeet.frontend.normalizer.numbers import normalize_numbers
def normalize(sentence):
""" Normalize English text.
"""
# preprocessing
sentence = unicode(sentence)
sentence = normalize_numbers(sentence)


@ -75,6 +75,8 @@ def _expand_number(m):
def normalize_numbers(text):
""" Normalize numbers in English text.
"""
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)


@ -39,6 +39,9 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert into pronunciation id sequence.
"""
def __init__(self):
self.backend = G2p()
self.phonemes = list(self.backend.phonemes)
@ -46,6 +49,18 @@ class English(Phonetics):
self.vocab = Vocab(self.phonemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
@ -54,6 +69,18 @@ class English(Phonetics):
return phonemes
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
ids = [
self.vocab.lookup(item) for item in phonemes
if item in self.vocab.stoi
@ -61,17 +88,46 @@ class English(Phonetics):
return ids
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]
The list of pronunciation id sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
class EnglishCharacter(Phonetics):
""" Normalize the input text sequence and convert it into character id sequence.
"""
def __init__(self):
self.backend = G2p()
self.graphemes = list(self.backend.graphemes)
@ -79,10 +135,34 @@ class EnglishCharacter(Phonetics):
self.vocab = Vocab(self.graphemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
str
The text sequence after normalization.
"""
words = normalize(sentence)
return words
def numericalize(self, sentence):
""" Convert a text sequence into ids.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
List of a character id sequence.
"""
ids = [
self.vocab.lookup(item) for item in sentence
if item in self.vocab.stoi
@ -90,17 +170,46 @@ class EnglishCharacter(Phonetics):
return ids
def reverse(self, ids):
""" Convert a character id sequence into text.
Parameters
-----------
ids: List[int]
List of a character id sequence.
Returns
----------
str
The input text sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Normalize the input text sequence and convert it into character id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
List of a character id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
class Chinese(Phonetics):
"""Normalize Chinese text sequence and convert it into ids.
"""
def __init__(self):
self.opencc_backend = OpenCC('t2s.json')
self.backend = G2pM()
@ -115,6 +224,18 @@ class Chinese(Phonetics):
return list(all_syllables)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
simplified = self.opencc_backend.convert(sentence)
phonemes = self.backend(simplified)
start = self.vocab.start_symbol
@ -136,15 +257,53 @@ class Chinese(Phonetics):
return cleaned_phonemes
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The list of pronunciation sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
Parameters
-----------
ids: List[int]
The list of pronunciation id sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
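To make the frontend API documented above concrete, here is a small usage sketch. It assumes the optional g2p backend required by `English` is installed; the example sentence and the comments on outputs are illustrative only.
```python
from parakeet.frontend.phonectic import English

frontend = English()

phonemes = frontend.phoneticize("Hello world.")  # List[str], e.g. starts with the "<s>" symbol
ids = frontend.numericalize(phonemes)            # List[int]
symbols = frontend.reverse(ids)                  # back to a list of symbols

ids2 = frontend("Hello world.")                  # __call__ = numericalize(phoneticize(...))
print(frontend.vocab_size)                       # size of the underlying Vocab
```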


@ -1,13 +1,46 @@
from typing import Dict, Iterable, List
from ruamel import yaml
from collections import OrderedDict
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Iterable, List
from collections import OrderedDict
__all__ = ["Vocab"]
class Vocab(object):
def __init__(self, symbols: Iterable[str],
""" Vocabulary.
Parameters
-----------
symbols: Iterable[str]
Common symbols.
padding_symbol: str, optional
Symbol for pad. Defaults to "<pad>".
unk_symbol: str, optional
Symbol for unknown. Defaults to "<unk>"
start_symbol: str, optional
Symbol for start. Defaults to "<s>"
end_symbol: str, optional
Symbol for end. Defaults to "</s>"
"""
def __init__(self,
symbols: Iterable[str],
padding_symbol="<pad>",
unk_symbol="<unk>",
start_symbol="<s>",
@ -23,7 +56,6 @@ class Vocab(object):
self.start_symbol = start_symbol
self.end_symbol = end_symbol
self.stoi = OrderedDict()
self.stoi.update(self.special_symbols)
@ -37,23 +69,33 @@ class Vocab(object):
@property
def num_specials(self):
""" The number of special symbols.
"""
return len(self.special_symbols)
# special tokens
@property
def padding_index(self):
""" The index of padding symbol
"""
return self.stoi.get(self.padding_symbol, -1)
@property
def unk_index(self):
"""The index of unknow symbol.
"""
return self.stoi.get(self.unk_symbol, -1)
@property
def start_index(self):
"""The index of start symbol.
"""
return self.stoi.get(self.start_symbol, -1)
@property
def end_index(self):
""" The index of end symbol.
"""
return self.stoi.get(self.end_symbol, -1)
def __repr__(self):
@ -64,12 +106,18 @@ class Vocab(object):
return self.__repr__()
def lookup(self, symbol):
""" The index that symbol correspond.
"""
return self.stoi[symbol]
def reverse(self, index):
""" The symbol thar index cottespond.
"""
return self.itos[index]
def add_symbol(self, symbol):
""" Add a new symbol in vocab.
"""
if symbol in self.stoi:
return
N = len(self.stoi)
@ -77,6 +125,7 @@ class Vocab(object):
self.itos[N] = symbol
def add_symbols(self, symbols):
""" Add multiple symbols in vocab.
"""
for symbol in symbols:
self.add_symbol(symbol)


@ -14,8 +14,9 @@
#from parakeet.models.clarinet import *
from parakeet.models.waveflow import *
#from parakeet.models.wavenet import *
from parakeet.models.wavenet import *
from parakeet.models.transformer_tts import *
#from parakeet.models.deepvoice3 import *
# from parakeet.models.fastspeech import *
from parakeet.models.tacotron2 import *


@ -27,11 +27,29 @@ __all__ = ["Tacotron2", "Tacotron2Loss"]
class DecoderPreNet(nn.Layer):
"""Decoder prenet module for Tacotron2.
Parameters
----------
d_input: int
The input feature size.
d_hidden: int
The hidden size.
d_output: int
The output feature size.
dropout_rate: float
The dropout probability.
"""
def __init__(self,
d_input: int,
d_hidden: int,
d_output: int,
dropout_rate: float=0.2):
dropout_rate: float):
super().__init__()
self.dropout_rate = dropout_rate
@ -39,23 +57,59 @@ class DecoderPreNet(nn.Layer):
self.linear2 = nn.Linear(d_hidden, d_output, bias_attr=False)
def forward(self, x):
"""Calculate forward propagation.
Parameters
----------
x: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded hidden state.
"""
x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate)
output = F.dropout(F.relu(self.linear2(x)), self.dropout_rate)
return output
class DecoderPostNet(nn.Layer):
"""Decoder postnet module for Tacotron2.
Parameters
----------
d_mels: int
The number of mel bands.
d_hidden: int
The hidden size of postnet.
kernel_size: int
The kernel size of the conv layer in postnet.
num_layers: int
The number of conv layers in postnet.
dropout: float
The dropout probability.
"""
def __init__(self,
d_mels: int=80,
d_hidden: int=512,
kernel_size: int=5,
padding: int=0,
num_layers: int=5,
dropout: float=0.1):
d_mels: int,
d_hidden: int,
kernel_size: int,
num_layers: int,
dropout: float):
super().__init__()
self.dropout = dropout
self.num_layers = num_layers
padding = int((kernel_size - 1) / 2),
self.conv_batchnorms = nn.LayerList()
k = math.sqrt(1.0 / (d_mels * kernel_size))
self.conv_batchnorms.append(
@ -91,15 +145,46 @@ class DecoderPostNet(nn.Layer):
data_format='NLC'))
def forward(self, input):
"""Calculate forward propagation.
Parameters
----------
input: Tensor [shape=(B, T_mel, C)]
Output sequence of features from decoder.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Output sequence of features after postnet.
"""
for i in range(len(self.conv_batchnorms) - 1):
input = F.dropout(
F.tanh(self.conv_batchnorms[i](input), self.dropout))
input = F.dropout(self.conv_batchnorms[self.num_layers - 1](input),
output = F.dropout(self.conv_batchnorms[self.num_layers - 1](input),
self.dropout)
return input
return output
class Tacotron2Encoder(nn.Layer):
"""Tacotron2 encoder module for Tacotron2.
Parameters
----------
d_hidden: int
The hidden size in encoder module.
conv_layers: int
The number of conv layers.
kernel_size: int
The kernel size of conv layers.
p_dropout: float
The dropout probability.
"""
def __init__(self,
d_hidden: int,
conv_layers: int,
@ -126,6 +211,22 @@ class Tacotron2Encoder(nn.Layer):
d_hidden, self.hidden_size, direction="bidirectional")
def forward(self, x, input_lens=None):
"""Calculate forward propagation of tacotron2 encoder.
Parameters
----------
x: Tensor [shape=(B, T)]
Batch of the sequences of padded character ids.
text_lens: Tensor [shape=(B,)], optional
Batch of lengths of each text input batch. Defaults to None.
Returns
-------
output : Tensor [shape=(B, T, C)]
Batch of the sequences of padded hidden states.
"""
for conv_batchnorm in self.conv_batchnorms:
x = F.dropout(F.relu(conv_batchnorm(x)),
self.p_dropout) #(B, T, C)
@ -135,6 +236,47 @@ class Tacotron2Encoder(nn.Layer):
class Tacotron2Decoder(nn.Layer):
"""Tacotron2 decoder module for Tacotron2.
Parameters
----------
d_mels: int
The number of mel bands.
reduction_factor: int
The reduction factor of tacotron.
d_encoder: int
The hidden size of encoder.
d_prenet: int
The hidden size in decoder prenet.
d_attention_rnn: int
The attention rnn layer hidden size.
d_decoder_rnn: int
The decoder rnn layer hidden size.
d_attention: int
The hidden size of the linear layer in location sensitive attention.
attention_filters: int
The filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
The kernel size of the conv layer in location sensitive attention.
p_prenet_dropout: float
The dropout probability in decoder prenet.
p_attention_dropout: float
The dropout probability in location sensitive attention.
p_decoder_dropout: float
The dropout probability in decoder.
"""
def __init__(self,
d_mels: int,
reduction_factor: int,
@ -175,6 +317,8 @@ class Tacotron2Decoder(nn.Layer):
self.stop_layer = nn.Linear(d_decoder_rnn + d_encoder, 1)
def _initialize_decoder_states(self, key):
"""init states be used in decoder
"""
batch_size = key.shape[0]
MAX_TIME = key.shape[1]
@ -199,6 +343,8 @@ class Tacotron2Decoder(nn.Layer):
self.processed_key = self.attention_layer.key_layer(key) #[B, T, C]
def _decode(self, query):
"""decode one time step
"""
cell_input = paddle.concat([query, self.attention_context], axis=-1)
# The first lstm layer
@ -232,6 +378,30 @@ class Tacotron2Decoder(nn.Layer):
return decoder_output, stop_logit, self.attention_weights
def forward(self, keys, querys, mask):
"""Calculate forward propagation of tacotron2 decoder.
Parameters
----------
keys: Tensor[shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
querys: Tensor[shape(B, T_query, C)]
Batch of the sequences of padded mel spectrogram.
mask: Tensor
Mask generated with text length. Shape should be (B, T_key, T_query) or broadcastable shape.
Returns
-------
mel_output: Tensor [shape=(B, T_query, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_query)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_query, T_key)]
Attention weights.
"""
querys = paddle.reshape(
querys,
[querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
@ -263,6 +433,31 @@ class Tacotron2Decoder(nn.Layer):
return mel_outputs, stop_logits, alignments
def infer(self, key, stop_threshold=0.5, max_decoder_steps=1000):
"""Calculate forward propagation of tacotron2 decoder.
Parameters
----------
keys: Tensor [shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
mel_output: Tensor [shape=(B, T_mel, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_mel, T_key)]
Attention weights.
"""
query = paddle.zeros(
shape=[key.shape[0], self.d_mels * self.reduction_factor],
dtype=key.dtype) #[B, C]
@ -295,17 +490,76 @@ class Tacotron2Decoder(nn.Layer):
class Tacotron2(nn.Layer):
"""
Tacotron2 module for end-to-end text-to-speech (E2E-TTS).
"""Tacotron2 model for end-to-end text-to-speech (E2E-TTS).
This is a module of Spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis
by Conditioning WaveNet on Mel Spectrogram Predictions`_,
This is a model of Spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions
<https://arxiv.org/abs/1712.05884>`_,
which converts the sequence of characters
into the sequence of mel spectrogram.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
Parameters
----------
frontend : parakeet.frontend.Phonetics
Frontend used to preprocess text.
d_mels: int
Number of mel bands.
d_encoder: int
Hidden size in encoder module.
encoder_conv_layers: int
Number of conv layers in encoder.
encoder_kernel_size: int
Kernel size of conv layers in encoder.
d_prenet: int
Hidden size in decoder prenet.
d_attention_rnn: int
Attention rnn layer hidden size in decoder.
d_decoder_rnn: int
Decoder rnn layer hidden size in decoder.
attention_filters: int
Filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
Kernel size of the conv layer in location sensitive attention.
d_attention: int
Hidden size of the linear layer in location sensitive attention.
d_postnet: int
Hidden size of postnet.
postnet_kernel_size: int
Kernel size of the conv layer in postnet.
postnet_conv_layers: int
Number of conv layers in postnet.
reduction_factor: int
Reduction factor of tacotron2.
p_encoder_dropout: float
Dropout probability in encoder.
p_prenet_dropout: float
Dropout probability in decoder prenet.
p_attention_dropout: float
Dropout probability in location sensitive attention.
p_decoder_dropout: float
Dropout probability in decoder.
p_postnet_dropout: float
Dropout probability in postnet.
"""
def __init__(self,
@ -350,11 +604,38 @@ class Tacotron2(nn.Layer):
d_mels=d_mels * reduction_factor,
d_hidden=d_postnet,
kernel_size=postnet_kernel_size,
padding=int((postnet_kernel_size - 1) / 2),
num_layers=postnet_conv_layers,
dropout=p_postnet_dropout)
def forward(self, text_inputs, mels, text_lens, output_lens=None):
"""Calculate forward propagation of tacotron2.
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
mels: Tensor [shape(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
text_lens: Tensor [shape=(B,)]
Batch of lengths of each text input batch.
output_lens: Tensor [shape=(B,)], optional
Batch of lengths of each mels batch. Defaults to None.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of features (B, T_mel, C);
mel_outputs_postnet: output sequence of features after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs, text_lens)
@ -386,6 +667,31 @@ class Tacotron2(nn.Layer):
@paddle.no_grad()
def infer(self, text_inputs, stop_threshold=0.5, max_decoder_steps=1000):
"""Generate the mel sepctrogram of features given the sequences of character ids.
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of spectrogram (B, T_mel, C);
mel_outputs_postnet: output sequence of spectrogram after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs)
mel_outputs, stop_logits, alignments = self.decoder.infer(
@ -407,7 +713,27 @@ class Tacotron2(nn.Layer):
@paddle.no_grad()
def predict(self, text, stop_threshold=0.5, max_decoder_steps=1000):
# TODO(lifuchen): implement predict function to product mel from texts
"""Generate the mel sepctrogram of features given the sequenc of characters.
Parameters
----------
text: str
Sequence of characters.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_outputs_postnet: output sequence of spectrogram after postnet (T_mel, C);
alignments: attention weights (T_mel, T_text).
"""
ids = np.asarray(self.frontend(text))
ids = paddle.unsqueeze(paddle.to_tensor(ids, dtype='int64'), [0])
outputs = self.infer(ids, stop_threshold, max_decoder_steps)
@ -416,6 +742,24 @@ class Tacotron2(nn.Layer):
@classmethod
def from_pretrained(cls, frontend, config, checkpoint_path):
"""Build a tacotron2 model from a pretrained model.
Parameters
----------
frontend: parakeet.frontend.Phonetics
Frontend used to preprocess text.
config: yacs.config.CfgNode
Model configs.
checkpoint_path: Path or str
The path of pretrained model checkpoint, without extension name.
Returns
-------
Tacotron2
The model built from the pretrained result.
"""
model = cls(frontend,
d_mels=config.data.d_mels,
d_encoder=config.model.d_encoder,
@ -442,11 +786,45 @@ class Tacotron2(nn.Layer):
class Tacotron2Loss(nn.Layer):
""" Tacotron2 Loss module
"""
def __init__(self):
super().__init__()
def forward(self, mel_outputs, mel_outputs_postnet, stop_logits,
mel_targets, stop_tokens):
"""Calculate tacotron2 loss.
Parameters
----------
mel_outputs: Tensor [shape=(B, T_mel, C)]
Output mel spectrogram sequence.
mel_outputs_postnet: Tensor [shape(B, T_mel, C)]
Output mel spectrogram sequence after postnet.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits before sigmoid.
mel_targets: Tensor [shape=(B, T_mel, C)]
Target mel spectrogram sequence.
stop_tokens: Tensor [shape=(B,)]
Target stop token.
Returns
-------
losses : Dict[str, Tensor]
loss: the sum of the other three losses;
mel_loss: MSE loss computed from mel_targets and mel_outputs;
post_mel_loss: MSE loss computed from mel_targets and mel_outputs_postnet;
stop_loss: stop loss computed from stop_logits and stop tokens.
"""
mel_loss = paddle.nn.MSELoss()(mel_outputs, mel_targets)
post_mel_loss = paddle.nn.MSELoss()(mel_outputs_postnet, mel_targets)
stop_loss = paddle.nn.BCEWithLogitsLoss()(stop_logits, stop_tokens)
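Putting the docstrings above together, a hedged end-to-end sketch of loading a pretrained Tacotron2 and synthesizing a mel spectrogram might look as follows; the config helper, the checkpoint path and the example sentence are hypothetical, not part of this commit.
```python
from parakeet.frontend.phonectic import English
from parakeet.models.tacotron2 import Tacotron2
from config import get_cfg_defaults   # the example's config.py (assumed helper)

config = get_cfg_defaults()
frontend = English()

# checkpoint path is hypothetical and given without extension, per from_pretrained's docstring
model = Tacotron2.from_pretrained(
    frontend, config, "runs/tacotron2/checkpoints/step-100000")
model.eval()

outputs = model.predict("Parakeet is a text-to-speech toolkit.")
mel = outputs["mel_outputs_postnet"]   # (T_mel, C) spectrogram for a vocoder
attn = outputs["alignments"]           # (T_mel, T_text) attention weights
```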


@ -33,8 +33,7 @@ __all__ = ["TransformerTTS", "TransformerTTSLoss"]
# Transformer TTS's own implementation of transformer
class MultiheadAttention(nn.Layer):
"""
Multihead scaled dot product attention with drop head. See
"""Multihead scaled dot product attention with drop head. See
[Scheduled DropHead: A Regularization Method for Transformer Models](https://arxiv.org/abs/2004.13342)
for details.


@ -30,15 +30,26 @@ from parakeet.utils import checkpoint, layer_tools
def crop(x, audio_start, audio_length):
"""Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are sliced to 0.5 seconds randomly while conditions are not, upsampled conditions should also be sliced to extaclt match the time steps of the audio slice.
"""Crop the upsampled condition to match audio_length.
Args:
x (Tensor): shape(B, C, T), dtype float32, the upsample condition.
audio_start (Tensor): shape(B, ), dtype: int64, the index the starting point.
audio_length (int): the length of the audio (number of samples it contaions).
The upsampled condition has the same time steps as the whole audio does.
But since audios are sliced to 0.5 seconds randomly while conditions are
not, upsampled conditions should also be sliced to exactly match the time
steps of the audio slice.
Returns:
Tensor: shape(B, C, audio_length), cropped condition.
Parameters
----------
x : Tensor [shape=(B, C, T)]
The upsampled condition.
audio_start : Tensor [shape=(B,), dtype:int]
The index of the starting point of the audio clips.
audio_length : int
The length of the audio clip (number of samples it contains).
Returns
-------
Tensor [shape=(B, C, audio_length)]
Cropped condition.
"""
# crop audio
slices = [] # for each example
@ -54,15 +65,37 @@ def crop(x, audio_start, audio_length):
class UpsampleNet(nn.LayerList):
def __init__(self, upscale_factors=[16, 16]):
"""UpsamplingNet.
It consists of several layers of Conv2DTranspose. Each Conv2DTranspose layer upsamples the time dimension by its `stride` times. And each Conv2DTranspose's filter_size at frequency dimension is 3.
"""A network used to upsample mel spectrogram to match the time steps of
audio.
Args:
upscale_factors (list[int], optional): time upsampling factors for each Conv2DTranspose Layer. The `UpsampleNet` contains len(upscale_factor) Conv2DTranspose Layers. Each upscale_factor is used as the `stride` for the corresponding Conv2DTranspose. Defaults to [16, 16].
Note:
np.prod(upscale_factors) should equals the `hop_length` of the stft transformation used to extract spectrogram features from audios. For example, 16 * 16 = 256, then the spectram extracted using a stft transformation whose `hop_length` is 256. See `librosa.stft` for more details.
It consists of several layers of Conv2DTranspose. Each Conv2DTranspose
layer upsamples the time dimension by its `stride` times.
Also, each Conv2DTranspose's filter_size at frequency dimension is 3.
Parameters
----------
upscale_factors : List[int], optional
Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], so the default
overall upsampling factor is 256.
Notes
------
``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
transformation used to extract spectrogram features from audio.
For example, ``16 * 16 = 256``, so a spectrogram extracted with an stft
transformation whose ``hop_length`` equals 256 is suitable.
See Also
---------
``librosa.core.stft``
"""
def __init__(self, upscale_factors=[16, 16]):
super(UpsampleNet, self).__init__()
self.upscale_factors = list(upscale_factors)
self.upscale_factor = 1
@ -78,13 +111,20 @@ class UpsampleNet(nn.LayerList):
padding=(1, factor // 2))))
def forward(self, x):
"""Compute the upsampled condition.
r"""Compute the upsampled condition.
Args:
x (Tensor): shape(B, F, T), dtype float32, the condition (mel spectrogram here.) (F means the frequency bands). In the internal Conv2DTransposes, the frequency dimension is treated as `height` dimension instead of `in_channels`.
Parameters
-----------
x : Tensor [shape=(B, F, T)]
The condition (mel spectrogram here). ``F`` means the frequency
bands, which is the feature size of the input.
In the internal Conv2DTransposes, the frequency dimension
is treated as ``height`` dimension instead of ``in_channels``.
Returns:
Tensor: shape(B, F, T * upscale_factor), dtype float32, the upsampled condition.
Tensor [shape=(B, F, T \* upscale_factor)]
The upsampled condition.
"""
x = paddle.unsqueeze(x, 1)
for sublayer in self:
@ -94,19 +134,36 @@ class UpsampleNet(nn.LayerList):
class ResidualBlock(nn.Layer):
"""A Residual block used in wavenet. Conv1D-gated-tanh Block.
It consists of a Conv1DCell and a Conv1D (kernel_size=1) to integrate
information of the condition.
Notes
--------
It does not have parametric residual or skip connection.
Parameters
-----------
residual_channels : int
The feature size of the input. It is also the feature size of the
residual output and skip output.
condition_dim : int
The feature size of the condition.
filter_size : int
Kernel size of the internal convolution cells.
dilation :int
Dilation of the internal convolution cells.
"""
def __init__(self,
residual_channels: int,
condition_dim: int,
filter_size: Union[int, Sequence[int]],
dilation: int):
"""A Residual block in wavenet. It does not have parametric residual or skip connection. It consists of a Conv1DCell and an Conv1D(filter_size = 1) to integrate the condition.
Args:
residual_channels (int): the channels of the input, residual and skip.
condition_dim (int): the channels of the condition.
filter_size (int): filter size of the internal convolution cell.
dilation (int): dilation of the internal convolution cell.
"""
super(ResidualBlock, self).__init__()
dilated_channels = 2 * residual_channels
# following clarinet's implementation, we do not have parametric residual
@ -133,16 +190,28 @@ class ResidualBlock(nn.Layer):
self.condition_dim = condition_dim
def forward(self, x, condition=None):
"""Conv1D gated-tanh Block.
"""Forward pass of the ResidualBlock.
Args:
x (Tensor): shape(B, C_res, T), the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) dtype float32.
condition (Tensor, optional): shape(B, C_cond, T), the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels). Defaults to None.
Parameters
-----------
x : Tensor [shape=(B, C, T)]
The input tensor.
Returns:
(residual, skip_connection)
residual (Tensor): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock.
skip_connection (Tensor): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks.
condition : Tensor, optional [shape(B, C_cond, T)]
The condition.
It has been upsampled in time steps, so it has the same time steps
as the input does. (C_cond stands for the condition's channels.)
Defaults to None.
Returns
-----------
residual : Tensor [shape=(B, C, T)]
The residual, which is used as the input to the next ResidualBlock.
skip_connection : Tensor [shape=(B, C, T)]
The skip connection. This output is accumulated with that of
other ResidualBlocks.
"""
h = x
@ -163,22 +232,38 @@ class ResidualBlock(nn.Layer):
return residual, skip_connection
def start_sequence(self):
"""Prepare the ResidualBlock to generate a new sequence. This method should be called before starting calling `add_input` multiple times.
"""Prepare the ResidualBlock to generate a new sequence.
Warnings
---------
This method should be called before calling ``add_input`` multiple times.
"""
self.conv.start_sequence()
self.condition_proj.start_sequence()
def add_input(self, x, condition=None):
"""Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.
"""Take a step input and return a step output.
Args:
x (Tensor): shape(B, C_res), input for a step, dtype float32.
condition (Tensor, optional): shape(B, C_cond). condition for a step, dtype float32. Defaults to None.
This method works similarly to ``forward`` but in a
``step-in-step-out`` fashion.
Returns:
(residual, skip_connection)
residual (Tensor): shape(B, C_res), the residual for a step, which is used as the input to the next layer of ResidualBlock.
skip_connection (Tensor): shape(B, C_res), the skip connection for a step. This output is accumulated with that of other ResidualBlocks.
Parameters
----------
x : Tensor [shape=(B, C)]
Input for a step.
condition : Tensor, optional [shape=(B, C_cond)]
Condition for a step. Defaults to None.
Returns
----------
residual : Tensor [shape=(B, C)]
The residual for a step, which is used as the input to the next
layer of ResidualBlock.
skip_connection : Tensor [shape=(B, C)]
The skip connection for a step. This output is accumulated with
that of other ResidualBlocks.
"""
h = x
@ -511,6 +596,54 @@ class WaveNet(nn.Layer):
class ConditionalWaveNet(nn.Layer):
r"""Conditional Wavenet. An implementation of
`WaveNet: A Generative Model for Raw Audio <http://arxiv.org/abs/1609.03499>`_.
It contains an UpsampleNet as the encoder and a WaveNet as the decoder.
It is an autoregressive model that generate raw audio.
Parameters
----------
upsample_factors : List[int]
The upsampling factors of the UpsampleNet.
n_stack : int
Number of convolution stacks in the WaveNet.
n_loop : int
Number of convolution layers in a convolution stack.
Convolution layers in a stack have exponentially growing dilations,
from 1 to :math:`k^{n_{loop} - 1}`, where k is the kernel size.
residual_channels : int
Feature size of each ResidualBlock.
output_dim : int
Feature size of the output. See ``loss_type`` for details.
n_mels : int
The number of bands of mel spectrogram.
filter_size : int, optional
Convolution kernel size of each ResidualBlock, by default 2.
loss_type : str, optional ["mog" or "softmax"]
The output type and loss type of the model, by default "mog".
If "softmax", the model input should be quantized audio and the model
outputs a discrete distribution.
If "mog", the model input is audio in floating point format, and the
model outputs parameters for a mixture of gaussian distributions.
Namely, the weight, mean and logscale of each gaussian distribution.
Thus, the ``output_size`` should be a multiple of 3.
log_scale_min : float, optional
Minimum value of the log probability density, by default -9.0.
This is only used for computing loss when ``loss_type`` is "mog", If the
"""
def __init__(self,
upsample_factors: List[int],
n_stack: int,
@ -521,8 +654,6 @@ class ConditionalWaveNet(nn.Layer):
filter_size: int=2,
loss_type: str="mog",
log_scale_min: float=-9.0):
"""Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model.
"""
super(ConditionalWaveNet, self).__init__()
self.encoder = UpsampleNet(upsample_factors)
self.decoder = WaveNet(n_stack=n_stack,
@ -537,13 +668,23 @@ class ConditionalWaveNet(nn.Layer):
def forward(self, audio, mel, audio_start):
"""Compute the output distribution given the mel spectrogram and the input(for teacher force training).
Args:
audio (Tensor): shape(B, T_audio), dtype float32, ground truth waveform, used for teacher force training.
mel (Tensor): shape(B, F, T_mel), dtype float32, mel spectrogram. Note that it is the spectrogram for the whole utterance.
audio_start (Tensor): shape(B, ), dtype: int, audio slices' start positions for each utterance.
Parameters
-----------
audio : Tensor [shape=(B, T_audio)]
Ground truth waveform, used for teacher forced training.
Returns:
Tensor: shape(B, T_audio - 1, C_putput), parameters for the output distribution.(C_output is the `output_dim` of the decoder.)
mel : Tensor [shape(B, F, T_mel)]
Mel spectrogram. Note that it is the spectrogram for the whole
utterance.
audio_start : Tensor [shape=(B,), dtype: int]
Audio slices' start positions for each utterance.
Returns
----------
Tensor [shape=(B, T_audio - 1, C_output)]
Parameters for the output distribution, where ``C_output`` is the
``output_dim`` of the decoder.
"""
audio_length = audio.shape[1] # audio clip's length
condition = self.encoder(mel)
@ -557,14 +698,21 @@ class ConditionalWaveNet(nn.Layer):
return y
def loss(self, y, t):
"""compute loss with respect to the output distribution and the targer audio.
"""Compute loss with respect to the output distribution and the target
audio.
Args:
y (Tensor): shape(B, T - 1, C_output), dtype float32, parameters of the output distribution.
t (Tensor): shape(B, T), dtype float32, target waveform.
Parameters
-----------
y : Tensor [shape=(B, T - 1, C_output)]
Parameters of the output distribution.
Returns:
Tensor: shape(1, ), dtype float32, the loss.
t : Tensor [shape=(B, T)]
Target waveform.
Returns
--------
Tensor [shape=(1,)]
The loss.
"""
t = t[:, 1:]
loss = self.decoder.loss(y, t)
@ -573,24 +721,35 @@ class ConditionalWaveNet(nn.Layer):
def sample(self, y):
"""Sample from the output distribution.
Args:
y (Tensor): shape(B, T, C_output), dtype float32, parameters of the output distribution.
Parameters
-----------
y : Tensor [shape=(B, T, C_output)]
Parameters of the output distribution.
Returns:
Tensor: shape(B, T), dtype float32, sampled waveform from the output distribution.
Returns
--------
Tensor [shape=(B, T)]
Sampled waveform from the output distribution.
"""
samples = self.decoder.sample(y)
return samples
@paddle.no_grad()
def infer(self, mel):
"""Synthesize waveform from mel spectrogram.
r"""Synthesize waveform from mel spectrogram.
Args:
mel (Tensor): shape(B, F, T), condition(mel spectrogram here).
Parameters
-----------
mel : Tensor [shape=(B, F, T)]
The condition (mel spectrogram here).
Returns:
Tensor: shape(B, T * upsacle_factor), synthesized waveform.(`upscale_factor` is the `upscale_factor` of the encoder `UpsampleNet`)
Returns
-----------
Tensor [shape=(B, T \* upscale_factor)]
Synthesized waveform.
``upscale_factor`` is the ``upscale_factor`` of the encoder
``UpsampleNet``.
"""
condition = self.encoder(mel)
batch_size, _, time_steps = condition.shape
@ -610,6 +769,20 @@ class ConditionalWaveNet(nn.Layer):
@paddle.no_grad()
def predict(self, mel):
r"""Synthesize audio from mel spectrogram.
The output and input are numpy arrays without batch.
Parameters
----------
mel : np.ndarray [shape=(C, T)]
Mel spectrogram of an utterance.
Returns
-------
Tensor : np.ndarray [shape=(C, T \* upsample_factor)]
The synthesized waveform of an utterance.
"""
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
audio = self.infer(mel)
@ -618,6 +791,21 @@ class ConditionalWaveNet(nn.Layer):
@classmethod
def from_pretrained(cls, config, checkpoint_path):
"""Build a ConditionalWaveNet model from a pretrained model.
Parameters
----------
config : yacs.config.CfgNode
Model configs.
checkpoint_path : Path or str
The path of the pretrained model checkpoint, without the extension name.
Returns
-------
ConditionalWaveNet
The model built from pretrained result.
"""
model = cls(
upsample_factors=config.model.upsample_factors,
n_stack=config.model.n_stack,
@ -631,5 +819,3 @@ class ConditionalWaveNet(nn.Layer):
layer_tools.summary(model)
checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
return model
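As a rough usage sketch (not part of the diff; ``config``, the checkpoint path and the mel file are assumptions prepared elsewhere), the pretrained vocoder can be used like this:

import numpy as np
# load the pretrained model and switch to evaluation mode
model = ConditionalWaveNet.from_pretrained(config, "checkpoints/step-1000000")
model.eval()
mel = np.load("mel.npy")      # hypothetical mel spectrogram, shape (C, T)
audio = model.predict(mel)    # synthesized waveform as a numpy array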

View File

@ -12,3 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.modules.attention import *
from parakeet.modules.audio import *
from parakeet.modules.conv import *
from parakeet.modules.geometry import *
from parakeet.modules.losses import *
from parakeet.modules.masking import *
from parakeet.modules.positional_encoding import *
from parakeet.modules.transformer import *

View File

@ -25,22 +25,34 @@ def scaled_dot_product_attention(q,
mask=None,
dropout=0.0,
training=True):
"""
scaled dot product attention with mask. Assume q, k, v all have the same
leader dimensions(denoted as * in descriptions below). Dropout is applied to
attention weights before weighted sum of values.
r"""Scaled dot product attention with masking.
Args:
q (Tensor): shape(*, T_q, d), the query tensor.
k (Tensor): shape(*, T_k, d), the key tensor.
v (Tensor): shape(*, T_k, d_v), the value tensor.
mask (Tensor, optional): shape(*, T_q, T_k) or broadcastable shape, the
mask tensor, 0 correspond to padding. Defaults to None.
Assume that q, k, v all have the same leading dimensions (denoted as * in
descriptions below). Dropout is applied to attention weights before
weighted sum of values.
Returns:
(out, attn_weights)
out (Tensor): shape(*, T_q, d_v), the context vector.
attn_weights (Tensor): shape(*, T_q, T_k), the attention weights.
Parameters
-----------
q : Tensor [shape=(\*, T_q, d)]
the query tensor.
k : Tensor [shape=(\*, T_k, d)]
the key tensor.
v : Tensor [shape=(\*, T_k, d_v)]
the value tensor.
mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
the mask tensor, zeros correspond to paddings. Defaults to None.
Returns
----------
out : Tensor [shape=(\*, T_q, d_v)]
the context vector.
attn_weights : Tensor [shape=(\*, T_q, T_k)]
the attention weights.
"""
d = q.shape[-1] # we only support imperative execution
qk = paddle.matmul(q, k, transpose_y=True)
@ -55,17 +67,25 @@ def scaled_dot_product_attention(q,
return out, attn_weights
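For reference, the unmasked core of this computation can be sketched standalone as follows (a simplified illustration, not taken from the diff):

import math
import paddle
import paddle.nn.functional as F

q = paddle.randn([2, 5, 8])    # (batch, T_q, d)
k = paddle.randn([2, 6, 8])    # (batch, T_k, d)
v = paddle.randn([2, 6, 8])    # (batch, T_k, d_v)
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(q.shape[-1])
weights = F.softmax(scores)            # (batch, T_q, T_k)
out = paddle.matmul(weights, v)        # (batch, T_q, d_v)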
def drop_head(x, drop_n_heads, training=True):
"""Drop n context vectors from multiple ones.
Parameters
----------
x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
The input, multiple context vectors.
drop_n_heads : int [0 <= drop_n_heads <= num_heads]
Number of vectors to drop.
training : bool
A flag indicating whether it is in training. If ``False``, no dropout is
applied.
Returns
-------
Tensor
The output.
"""
if not training or (drop_n_heads == 0):
return x
@ -101,21 +121,31 @@ def _concat_heads(x):
# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
"""Monohead Attention module.
Parameters
----------
model_dim : int
Feature size of the query.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
"""
def __init__(self,
model_dim: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
v_dim = v_dim or model_dim
@ -128,20 +158,29 @@ class MonoheadAttention(nn.Layer):
self.dropout = dropout
def forward(self, q, k, v, mask):
"""
Compute context vector and attention weights.
"""Compute context vector and attention weights.
Args:
q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or
broadcastable shape, dtype: float32 or float64, the mask.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
Returns:
(out, attention_weights)
out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
attention_weights (Tensor): shape(batch_size, times_steps_q, time_steps_k), the attention weights.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = self.affine_q(q) # (B, T, C)
k = self.affine_k(k)
@ -155,34 +194,40 @@ class MonoheadAttention(nn.Layer):
class MultiheadAttention(nn.Layer):
"""
Multihead scaled dot product attention.
"""Multihead Attention module.
Parameters
-----------
model_dim: int
The feature size of query.
num_heads : int
The number of attention heads.
dropout : float, optional
Dropout probability of scaled dot product attention and final context
vector. Defaults to 0.0.
k_dim : int, optional
Feature size of the key of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
v_dim : int, optional
Feature size of the value of each scaled dot product attention. If not
provided, it is set to ``model_dim / num_heads``. Defaults to None.
Raises
---------
ValueError
If ``model_dim`` is not divisible by ``num_heads``.
"""
def __init__(self,
model_dim: int,
num_heads: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MultiheadAttention, self).__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
@ -199,20 +244,29 @@ class MultiheadAttention(nn.Layer):
self.dropout = dropout
def forward(self, q, k, v, mask):
"""
Compute context vector and attention weights.
"""Compute context vector and attention weights.
Args:
q (Tensor): shape(batch_size, time_steps_q, model_dim), the queries.
k (Tensor): shape(batch_size, time_steps_k, model_dim), the keys.
v (Tensor): shape(batch_size, time_steps_k, model_dim), the values.
mask (Tensor): shape(batch_size, times_steps_q, time_steps_k) or
broadcastable shape, dtype: float32 or float64, the mask.
Parameters
-----------
q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The queries.
Returns:
(out, attention_weights)
out (Tensor), shape(batch_size, time_steps_q, model_dim), the context vector.
attention_weights (Tensor): shape(batch_size, times_steps_q, time_steps_k), the attention weights.
k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
The values.
mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
The mask.
Returns
----------
out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
k = _split_heads(self.affine_k(k), self.num_heads)
@ -228,6 +282,28 @@ class MultiheadAttention(nn.Layer):
class LocationSensitiveAttention(nn.Layer):
"""Location Sensitive Attention module.
Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_
Parameters
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of the attention representation.
location_filters : int
Number of filters of the attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
def __init__(self,
d_query: int,
d_key: int,
@ -259,6 +335,34 @@ class LocationSensitiveAttention(nn.Layer):
value,
attention_weights_cat,
mask=None):
"""Compute context vector and attention weights.
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
Concatenated attention weights.
mask : Tensor, optional
The mask. Shape should be (batch_size, time_steps_q, time_steps_k) or
broadcastable shape. Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, time_steps_q, d_attention)]
The context vector.
attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
The attention weights.
"""
processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
processed_attention_weights = self.location_layer(

View File

@ -8,28 +8,48 @@ __all__ = ["quantize", "dequantize", "STFT"]
def quantize(values, n_bands):
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in [0, n_bands).
"""Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in
[0, n_bands).
Args:
values (Tensor): dtype: flaot32 or float64. the floating point value.
n_bands (int): the number of bands. The output integer Tensor's value is in the range [0, n_bans).
Parameters
-----------
values : Tensor [dtype: flaot32 or float64]
The floating point value.
Returns:
Tensor: the quantized tensor, dtype: int64.
n_bands : int
The number of bands. The output integer Tensor's value is in the range
[0, n_bans).
Returns
----------
Tensor [dtype: int 64]
The quantized tensor.
"""
quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64")
return quantized
def dequantize(quantized, n_bands, dtype=None):
"""Linearlly dequantize an integer Tensor into a float Tensor in the range [-1, 1).
"""Linearlly dequantize an integer Tensor into a float Tensor in the range
[-1, 1).
Args:
quantized (Tensor): dtype: int64. The quantized value in the range [0, n_bands).
n_bands (int): number of bands. The input integer Tensor's value is in the range [0, n_bans).
dtype (str, optional): data type of the output.
Returns:
Tensor: the dequantized tensor, dtype is specified by dtype.
Parameters
-----------
quantized : Tensor [dtype: int]
The quantized value in the range [0, n_bands).
n_bands : int
Number of bands. The input integer Tensor's value is in the range
[0, n_bans).
dtype : str, optional
Data type of the output.
Returns
-----------
Tensor
The dequantized tensor, dtype is specified by `dtype`. If `dtype` is
not specified, the default float data type is used.
"""
dtype = dtype or paddle.get_default_dtype()
value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0
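A quick round-trip sketch of the two functions above (standalone, assuming the import path below):

import paddle
from parakeet.modules.audio import quantize, dequantize

x = paddle.to_tensor([-1.0, -0.5, 0.0, 0.5, 0.999])
q = quantize(x, n_bands=256)           # int64 values in [0, 256)
x_hat = dequantize(q, n_bands=256)     # floats back in [-1, 1), error bounded by 1 / 256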
@ -37,15 +57,36 @@ def dequantize(quantized, n_bands, dtype=None):
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
window : str, optional
Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hanning".
Notes
-----------
It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
details.
Given an audio with ``T`` samples, the STFT transformation outputs a
spectrum of shape (C, frames) with complex dtype, where
``C = 1 + n_fft // 2`` and ``frames = 1 + T // hop_length``.
Only ``center`` and ``reflect`` padding is supported now.
"""
def __init__(self, n_fft, hop_length, win_length, window="hanning"):
super(STFT, self).__init__()
self.hop_length = hop_length
self.n_bin = 1 + n_fft // 2
@ -73,13 +114,18 @@ class STFT(nn.Layer):
def forward(self, x):
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns
------------
real : Tensor [shape=(B, C, 1, frames)]
The real part of the spectrogram.
imag : Tensor [shape=(B, C, 1, frames)]
The imaginary part of the spectrogram.
"""
# x(batch_size, time_steps)
# pad it first with reflect mode
@ -95,30 +141,34 @@ class STFT(nn.Layer):
return real, imag
def power(self, x):
"""Compute the power spectrogram.
"""Compute the power spectrum.
Args:
(real, imag)
real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns:
Variable: shape(B, C, 1, T), dtype flaot32, the power spectrogram.
Returns
------------
Tensor [shape=(B, C, 1, T)]
The power spectrum.
"""
real, imag = self(x)
power = real**2 + imag**2
return power
def magnitude(self, x):
"""Compute the magnitude spectrogram.
"""Compute the magnitude of the spectrum.
Args:
(real, imag)
real (Variable): shape(B, C, 1, T), dtype flaot32, the real part of the spectrogram.
imag (Variable): shape(B, C, 1, T), dtype flaot32, the image part of the spectrogram.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
Returns:
Variable: shape(B, C, 1, T), dtype flaot32, the magnitude spectrogram. It is the square root of the power spectrogram.
Returns
------------
Tensor [shape=(B, C, 1, T)]
The magnitude of the spectrum.
"""
power = self.power(x)
magnitude = paddle.sqrt(power)
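A minimal usage sketch of the STFT module (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.audio import STFT

stft = STFT(n_fft=1024, hop_length=256, win_length=1024, window="hanning")
wav = paddle.randn([1, 22050])   # (B, T), a dummy one-second waveform
real, imag = stft(wav)           # each of shape (B, 1 + n_fft // 2, 1, frames)
mag = stft.magnitude(wav)        # magnitude of the spectrum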

View File

@ -1,90 +0,0 @@
import math
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.modules.conv import Conv1dBatchNorm
class Highway(nn.Layer):
def __init__(self, num_features):
super(Highway, self).__init__()
self.H = nn.Linear(num_features, num_features)
self.T = nn.Linear(num_features, num_features,
bias_attr=I.Constant(-1.))
self.num_features = num_features
def forward(self, x):
H = F.relu(self.H(x))
T = F.sigmoid(self.T(x)) # gate
return H * T + x * (1.0 - T)
class CBHG(nn.Layer):
def __init__(self, in_channels, out_channels_per_conv, max_kernel_size,
projection_channels,
num_highways, highway_features,
gru_features):
super(CBHG, self).__init__()
self.conv1d_banks = nn.LayerList(
[Conv1dBatchNorm(in_channels, out_channels_per_conv, (k,),
padding=((k - 1) // 2, k // 2))
for k in range(1, 1 + max_kernel_size)])
self.projections = nn.LayerList()
projection_channels = list(projection_channels)
proj_in_channels = [max_kernel_size *
out_channels_per_conv] + projection_channels
proj_out_channels = projection_channels + \
[in_channels] # ensure residual connection
for c_in, c_out in zip(proj_in_channels, proj_out_channels):
conv = nn.Conv1D(c_in, c_out, (3,), padding=(1, 1))
self.projections.append(conv)
if in_channels != highway_features:
self.pre_highway = nn.Linear(in_channels, highway_features)
self.highways = nn.LayerList(
[Highway(highway_features) for _ in range(num_highways)])
self.gru = nn.GRU(highway_features, gru_features,
direction="bidirectional")
self.in_channels = in_channels
self.out_channels_per_conv = out_channels_per_conv
self.max_kernel_size = max_kernel_size
self.num_projections = 1 + len(projection_channels)
self.num_highways = num_highways
self.highway_features = highway_features
self.gru_features = gru_features
def forward(self, x):
input = x
# conv banks
conv_outputs = []
for conv in self.conv1d_banks:
conv_outputs.append(conv(x))
x = F.relu(paddle.concat(conv_outputs, 1))
# max pool
x = F.max_pool1d(x, 2, stride=1, padding=(0, 1))
# conv1d projections
n_projections = len(self.projections)
for i, conv in enumerate(self.projections):
x = conv(x)
if i != n_projections:
x = F.relu(x)
x += input # residual connection
# highway
x = paddle.transpose(x, [0, 2, 1])
if hasattr(self, "pre_highway"):
x = self.pre_highway(x)
# gru
x, _ = self.gru(x)
return x

View File

@ -1,62 +0,0 @@
import paddle
from paddle import nn
from paddle.nn import functional as F
def residual_connection(input, layer):
"""residual connection, only used for single input-single output layer.
y = x + F(x) where F corresponds to the layer.
Args:
x (Tensor): the input tensor.
layer (callable): a callable that preserve tensor shape.
"""
return input + layer(input)
class ResidualWrapper(nn.Layer):
def __init__(self, layer):
super(ResidualWrapper, self).__init__()
self.layer = layer
def forward(self, x):
return residual_connection(x, self.layer)
class PreLayerNormWrapper(nn.Layer):
def __init__(self, layer, d_model):
super(PreLayerNormWrapper, self).__init__()
self.layer = layer
self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
def forward(self, x):
return x + self.layer(self.layer_norm(x))
class PostLayerNormWrapper(nn.Layer):
def __init__(self, layer, d_model):
super(PostLayerNormWrapper, self).__init__()
self.layer = layer
self.layer_norm = nn.LayerNorm([d_model], epsilon=1e-6)
def forward(self, x):
return self.layer_norm(x + self.layer(x))
def context_gate(input, axis):
"""sigmoid gate the content by gate.
Args:
input (Tensor): shape(*, d_axis, *), the input, treated as content & gate.
axis (int): the axis to chunk content and gate.
Raises:
ValueError: if input.shape[axis] is not even.
Returns:
Tensor: shape(*, d_axis / 2 , *), the gated content.
"""
size = input.shape[axis]
if size % 2 != 0:
raise ValueError("the size of the {}-th dimension of input should "
"be even, but received {}".format(axis, size))
content, gate = paddle.chunk(input, 2, axis)
return F.sigmoid(gate) * content

View File

@ -15,19 +15,69 @@
import paddle
from paddle import nn
__all__ = [
"Conv1dCell",
"Conv1dBatchNorm",
]
class Conv1dCell(nn.Conv1D):
"""
A subclass of Conv1d layer, which can be used like an RNN cell. It can take
step input and return step output. It is done by keeping an internal buffer,
when adding a step input, we shift the buffer and return a step output. For
single step case, convolution devolves to a linear transformation.
"""A subclass of Conv1D layer, which can be used in an autoregressive
decoder like an RNN cell.
When used in autoregressive decoding, it performs causal temporal
convolution incrementally. At each time step, it takes a step input and
returns a step output.
Notes
------
It is done by caching an internal buffer of length ``receptive_field - 1``.
When adding a step input, the buffer is shifted by one step, the latest
input is appended to the buffer and the oldest step is discarded. And it
returns a step output. For the single step case, convolution is equivalent
to a linear transformation.
That it can be used as a cell depends on several restrictions:
1. stride must be 1;
2. padding must be a causal padding (receptive_field - 1, 0).
Thus, these arguments are removed from the ``__init__`` method of this
class.
Parameters
----------
in_channels: int
The feature size of the input.
out_channels: int
The feature size of the output.
kernel_size: int or Tuple[int]
The size of the kernel.
dilation: int or Tuple[int]
The dilation of the convolution, by default 1
weight_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr: ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias. If ``False``, this layer does not
have a bias, by default None.
Examples
--------
>>> cell = Conv1dCell(3, 4, kernel_size=5)
>>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
>>> outputs = []
>>> cell.eval()
>>> cell.start_sequence()
>>> for xt in inputs:
>>> outputs.append(cell.add_input(xt))
>>> len(outputs)
16
>>> outputs[0].shape
[4, 4]
"""
def __init__(self,
@ -54,9 +104,23 @@ class Conv1dCell(nn.Conv1D):
@property
def receptive_field(self):
"""The receptive field of the Conv1dCell.
"""
return self._r
def start_sequence(self):
"""Prepare the layer for a series of incremental forward.
Warnings
---------
This method should be called before a sequence of calls to
``add_input``.
Raises
------
Exception
If this method is called when the layer is in training mode.
"""
if self.training:
raise Exception("only use start_sequence in evaluation")
self._buffer = None
@ -72,21 +136,41 @@ class Conv1dCell(nn.Conv1D):
(self._out_channels, -1))
def initialize_buffer(self, x_t):
"""Initialize the buffer for the step input.
Parameters
----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
"""
batch_size, _ = x_t.shape
self._buffer = paddle.zeros(
(batch_size, self._in_channels, self.receptive_field),
dtype=x_t.dtype)
def update_buffer(self, x_t):
"""Shift the buffer by one step.
Parameters
----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
"""
self._buffer = paddle.concat(
[self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
def add_input(self, x_t):
"""
Arguments:
x_t (Tensor): shape (batch_size, in_channels), step input.
Rerurns:
y_t (Tensor): shape (batch_size, out_channels), step output.
"""Add step input and compute step output.
Parameters
-----------
x_t : Tensor [shape=(batch_size, in_channels)]
The step input.
Returns
-------
y_t :Tensor [shape=(batch_size, out_channels)]
The step output.
"""
batch_size = x_t.shape[0]
if self.receptive_field > 1:
@ -108,6 +192,45 @@ class Conv1dCell(nn.Conv1D):
class Conv1dBatchNorm(nn.Layer):
"""A Conv1D Layer followed by a BatchNorm1D.
Parameters
----------
in_channels : int
The feature size of the input.
out_channels : int
The feature size of the output.
kernel_size : int
The size of the convolution kernel.
stride : int, optional
The stride of the convolution, by default 1.
padding : int, str or Tuple[int], optional
The padding of the convolution.
If int, a symmetrical padding is applied before convolution;
If str, it should be "same" or "valid";
If Tuple[int], its length should be 2, meaning
``(pad_before, pad_after)``, by default 0.
weight_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the convolution kernel, by default None.
bias_attr : ParamAttr, Initializer, str or bool, optional
The parameter attribute of the bias of the convolution, by default
None.
data_format : str ["NCL" or "NLC"], optional
The data layout of the input, by default "NCL"
momentum : float, optional
The momentum of the BatchNorm1D layer, by default 0.9
epsilon : float, optional
The epsilon of the BatchNorm1D layer, by default 1e-05.
"""
def __init__(self,
in_channels,
out_channels,
@ -136,6 +259,18 @@ class Conv1dBatchNorm(nn.Layer):
data_format=data_format)
def forward(self, x):
"""Forward pass of the Conv1dBatchNorm layer.
Parameters
----------
x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
The input tensor. Its data layout depends on ``data_format``.
Returns
-------
Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
The output tensor.
"""
x = self.conv(x)
x = self.bn(x)
return x
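A small usage sketch of this layer (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.conv import Conv1dBatchNorm

layer = Conv1dBatchNorm(in_channels=80, out_channels=256, kernel_size=3, padding=1)
x = paddle.randn([4, 80, 100])   # (B, C_in, T) with the default "NCL" layout
y = layer(x)                     # (B, C_out, T)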

View File

@ -4,16 +4,25 @@ import paddle
def shuffle_dim(x, axis, perm=None):
"""Permute input tensor along aixs given the permutation or randomly.
Args:
x (Tensor): shape(*, d_{axis}, *), the input tensor.
axis (int): the axis to shuffle.
perm (list[int], ndarray, optional): a permutation of [0, d_{axis}),
the order to reorder the tensor along the `axis`-th dimension, if
not provided, randomly shuffle the `axis`-th dimension. Defaults to
None.
Parameters
----------
x : Tensor
The input tensor.
Returns:
Tensor: the shuffled tensor, it has the same shape as x does.
axis : int
The axis to shuffle.
perm : List[int], ndarray, optional
The order to reorder the tensor along the ``axis``-th dimension.
It is a permutation of ``[0, d)``, where d is the size of the
``axis``-th dimension of the input tensor. If not provided,
a random permutation is used. Defaults to None.
Returns
---------
Tensor
The shuffled tensor, which has the same shape as x does.
"""
size = x.shape[axis]
if perm is not None and len(perm) != size:

View File

@ -4,29 +4,128 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
__all__ = [
"weighted_mean",
"masked_l1_loss",
"masked_softmax_with_cross_entropy",
"diagonal_loss",
]
def weighted_mean(input, weight):
"""weighted mean.(It can also be used as masked mean.)
"""Weighted mean. It can also be used as masked mean.
Args:
input (Tensor): input tensor, floating point dtype.
weight (Tensor): weight tensor with broadcastable shape.
Parameters
-----------
input : Tensor
The input tensor.
weight : Tensor
The weight tensor with broadcastable shape with the input.
Returns:
Tensor: shape(1,), weighted mean tensor with the same dtype as input.
Returns
----------
Tensor [shape=(1,)]
Weighted mean tensor with the same dtype as input.
Warnings
---------
This is not a mathematical weighted mean. It performs weighted sum and
simple average.
"""
weight = paddle.cast(weight, input.dtype)
return paddle.mean(input * weight)
def masked_l1_loss(prediction, target, mask):
"""Compute maksed L1 loss.
Parameters
----------
prediction : Tensor
The prediction.
target : Tensor
The target. The shape should be broadcastable to ``prediction``.
mask : Tensor
The mask. The shape should be broadcastable to the broadcasted shape of
``prediction`` and ``target``.
Returns
-------
Tensor [shape=(1,)]
The masked L1 loss.
"""
abs_error = F.l1_loss(prediction, target, reduction='none')
loss = weighted_mean(abs_error, mask)
return loss
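A short usage sketch of the masked loss (standalone; shapes are arbitrary and the mask zeroes out the last three frames):

import paddle
from parakeet.modules.losses import masked_l1_loss

pred = paddle.randn([2, 10, 80])
target = paddle.randn([2, 10, 80])
mask = paddle.concat([paddle.ones([2, 7, 1]), paddle.zeros([2, 3, 1])], axis=1)
loss = masked_l1_loss(pred, target, mask)   # a scalar Tensor of shape (1,)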
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
"""Compute masked softmax with cross entropy loss.
Parameters
----------
logits : Tensor
The logits. The ``axis``-th axis is the class dimension.
label : Tensor [dtype: int]
The label. The size of the ``axis``-th axis should be 1.
mask : Tensor
The mask. The shape should be broadcastable to ``label``.
axis : int, optional
The index of the class dimension in the shape of ``logits``, by default
-1.
Returns
-------
Tensor [shape=(1,)]
The masked softmax with cross entropy loss.
"""
ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
loss = weighted_mean(ce, mask)
return loss
def diagonal_loss(
attentions,
input_lengths,
target_lengths,
g=0.2,
multihead=False):
"""A metric to evaluate how diagonal a attention distribution is.
It is computed for batch attention distributions. For each attention
distribution, the valid decoder time steps and encoder time steps may
differ.
Parameters
----------
attentions : Tensor [shape=(B, T_dec, T_enc) or (B, H, T_dec, T_enc)]
The attention weights from an encoder-decoder structure.
input_lengths : Tensor [shape=(B,)]
The valid length for each encoder output.
target_lengths : Tensor [shape=(B,)]
The valid length for each decoder output.
g : float, optional
Width parameter of the diagonal guide (passed to ``guided_attentions``),
by default 0.2.
multihead : bool, optional
A flag indicating whether ``attentions`` is a multihead attention's
attention distribution.
If ``True``, the shape of attention is ``(B, H, T_dec, T_enc)``, by
default False.
Returns
-------
Tensor [shape=(1,)]
The diagonal loss.
"""
W = guided_attentions(input_lengths, target_lengths, g)
W_tensor = paddle.to_tensor(W)
if not multihead:

View File

@ -1,32 +1,114 @@
import paddle
from paddle.fluid.layers import sequence_mask
__all__ = [
"id_mask",
"feature_mask",
"combine_mask",
"future_mask",
]
def id_mask(input, padding_index=0, dtype="bool"):
"""Generate mask with input ids.
Those positions where the value equals ``padding_index`` correspond to 0 or
``False``, otherwise, 1 or ``True``.
Parameters
----------
input : Tensor [dtype: int]
The input tensor. It represents the ids.
padding_index : int, optional
The id which represents padding, by default 0.
dtype : str, optional
Data type of the returned mask, by default "bool".
Returns
-------
Tensor
The generated mask. It has the same shape as ``input`` does.
"""
return paddle.cast(input != padding_index, dtype)
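For instance (a standalone sketch assuming the import path below):

import paddle
from parakeet.modules.masking import id_mask

ids = paddle.to_tensor([[3, 5, 7, 0, 0], [2, 4, 0, 0, 0]])  # 0 is the padding id
mask = id_mask(ids)  # bool mask, False at the padding positions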
def feature_mask(input, axis, dtype="bool"):
"""Compute mask from input features.
For input features represented as batched feature vectors, those vectors
which are all zeros are considered padding vectors.
Parameters
----------
input : Tensor [dtype: float]
The input tensor which represents features.
axis : int
The index of the feature dimension in ``input``. Other dimensions are
considered ``spatial`` dimensions.
dtype : str, optional
Data type of the generated mask, by default "bool"
Returns
-------
Tensor
The generated mask with ``spatial`` shape as mentioned above.
It has one less dimension than ``input`` does.
"""
feature_sum = paddle.sum(paddle.abs(input), axis)
return paddle.cast(feature_sum != 0, dtype)
def combine_mask(mask1, mask2):
"""Combine two masks with multiplication or logical and.
Parameters
-----------
mask1 : Tensor
The first mask.
mask2 : Tensor
The second mask with broadcastable shape with ``mask1``.
Returns
--------
Tensor
Combined mask.
Notes
------
It is mainly used to combine the padding mask and no future mask for
transformer decoder.
Padding mask is used to mask padding positions of the decoder inputs and
no future mask is used to prevent the decoder from seeing future
information.
"""
# TODO: to support boolean mask by using logical_and?
if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL:
return paddle.logical_and(mask1, mask2)
else:
return mask1 * mask2
def future_mask(time_steps, dtype="bool"):
"""Generate lower triangular mask.
It is used at the transformer decoder to prevent the decoder from seeing
future information.
Parameters
----------
time_steps : int
Decoder time steps.
dtype : str, optional
The data type of the generated mask, by default "bool".
Returns
-------
Tensor
The generated mask.
"""
mask = paddle.tril(paddle.ones([time_steps, time_steps]))
return paddle.cast(mask, dtype)
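Putting the masking helpers together for a decoder (a standalone sketch; the unsqueeze follows the broadcasting pattern used elsewhere in this diff):

import paddle
from parakeet.modules.masking import id_mask, future_mask, combine_mask

ids = paddle.to_tensor([[5, 6, 7, 0]])        # decoder input ids, 0 = padding
padding_mask = id_mask(ids, dtype="float32")  # (B, T)
no_future = future_mask(4, dtype="float32")   # (T, T), lower triangular
decoder_mask = combine_mask(padding_mask.unsqueeze(1), no_future)  # (B, T, T)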

View File

@ -3,21 +3,34 @@ import numpy as np
import paddle
from paddle.nn import functional as F
__all__ = ["positional_encoding"]
def positional_encoding(start_index, length, size, dtype=None):
"""
Generate standard positional encoding.
r"""Generate standard positional encoding matrix.
pe(pos, 2i) = sin(pos / 10000 ** (2i / size))
pe(pos, 2i+1) = cos(pos / 10000 ** (2i / size))
.. math::
Args:
start_index (int): the start index.
length (int): the length of the positional encoding.
size (int): positional encoding dimension.
pe(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{size}}}) \\
pe(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{size}}})
Returns:
encodings (Tensor): shape(length, size), the positional encoding.
Parameters
----------
start_index : int
The start index.
length : int
The timesteps of the positional encoding to generate.
size : int
Feature size of positional encoding.
Returns
-------
Tensor [shape=(length, size)]
The positional encoding.
Raises
------
ValueError
If ``size`` is not divisible by 2.
"""
if (size % 2 != 0):
raise ValueError("size should be divisible by 2")

View File

@ -5,23 +5,35 @@ from paddle.nn import functional as F
from parakeet.modules import attention as attn
from parakeet.modules.masking import combine_mask
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
"TransformerDecoderLayer",
]
class PositionwiseFFN(nn.Layer):
"""
A faithful implementation of Position-wise Feed-Forward Network
"""A faithful implementation of Position-wise Feed-Forward Network
in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
It is basically a 3-layer MLP, with relu actication and dropout in between.
It is basically a 2-layer MLP, with relu actication and dropout in between.
Parameters
----------
input_size: int
The feature size of the input. It is also the feature size of the
output.
hidden_size: int
The hidden size.
dropout: float
The probability of the Dropout applied to the output of the first
layer, by default 0.
"""
def __init__(self,
input_size: int,
hidden_size: int,
dropout=0.0):
"""
Args:
input_size (int): the input feature size.
hidden_size (int): the hidden layer's feature size.
dropout (float, optional): probability of dropout applied to the
output of the first fully connected layer. Defaults to 0.0.
"""
super(PositionwiseFFN, self).__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, input_size)
@ -31,13 +43,17 @@ class PositionwiseFFN(nn.Layer):
self.hidden_szie = hidden_size
def forward(self, x):
"""positionwise feed forward network.
r"""Forward pass of positionwise feed forward network.
Args:
x (Tensor): shape(*, input_size), the input tensor.
Parameters
----------
x : Tensor [shape=(\*, input_size)]
The input tensor, where ``\*`` means arbitary shape.
Returns:
Tensor: shape(*, input_size), the output tensor.
Returns
-------
Tensor [shape=(\*, input_size)]
The output tensor.
"""
l1 = self.dropout(F.relu(self.linear1(x)))
l2 = self.linear2(l1)
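A small usage sketch of this module (standalone, assuming the import path below; shapes are arbitrary):

import paddle
from parakeet.modules.transformer import PositionwiseFFN

ffn = PositionwiseFFN(input_size=256, hidden_size=1024, dropout=0.1)
x = paddle.randn([2, 50, 256])   # (*, input_size)
y = ffn(x)                       # same shape as x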
@ -45,18 +61,32 @@ class PositionwiseFFN(nn.Layer):
class TransformerEncoderLayer(nn.Layer):
"""
Transformer encoder layer.
"""A faithful implementation of Transformer encoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model :int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of self attention (a ``MultiheadAttention``
layer).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
"""
Args:
d_model (int): the feature size of the input, and the output.
n_heads (int): the number of heads in the internal MultiHeadAttention layer.
d_ffn (int): the hidden size of the internal PositionwiseFFN.
dropout (float, optional): the probability of the dropout in
MultiHeadAttention and PositionwiseFFN. Defaults to 0.
"""
super(TransformerEncoderLayer, self).__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
@ -64,37 +94,68 @@ class TransformerEncoderLayer(nn.Layer):
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, x, mask):
"""
Args:
x (Tensor): shape(batch_size, time_steps, d_model), the decoder input.
mask (Tensor): shape(batch_size, time_steps), the padding mask.
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The input.
mask : Tensor
The padding mask. The shape is (batch_size, time_steps,
time_steps) or broadcastable shape.
Returns
-------
x : Tensor [shape=(batch_size, time_steps, d_model)]
The encoded output.
attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
The attention weights of the self attention.
"""
context_vector, attn_weights = self.self_mha(x, x, x, mask)
x = self.layer_norm1(
F.dropout(x + context_vector,
self.dropout,
training=self.training))
x = self.layer_norm2(
F.dropout(x + self.ffn(x),
self.dropout,
training=self.training))
return x, attn_weights
class TransformerDecoderLayer(nn.Layer):
"""
Transformer decoder layer.
"""A faithful implementation of Transformer decoder layer in
`Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
Parameters
----------
d_model :int
The feature size of the input. It is also the feature size of the
output.
n_heads : int
The number of heads of attentions (``MultiheadAttention``
layers).
d_ffn : int
The hidden size of the positional feed forward network (a
``PositionwiseFFN`` layer).
dropout : float, optional
The probability of the dropout in MultiHeadAttention and
PositionwiseFFN, by default 0.
Notes
------
It uses the PostLN (post layer norm) scheme.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
"""
Args:
d_model (int): the feature size of the input, and the output.
n_heads (int): the number of heads in the internal MultiHeadAttention layer.
d_ffn (int): the hidden size of the internal PositionwiseFFN.
dropout (float, optional): the probability of the dropout in
MultiHeadAttention and PositionwiseFFN. Defaults to 0.
"""
super(TransformerDecoderLayer, self).__init__()
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
@ -105,29 +166,51 @@ class TransformerDecoderLayer(nn.Layer):
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
self.dropout = dropout
def forward(self, q, k, v, encoder_mask, decoder_mask):
"""Forward pass of TransformerEncoderLayer.
Parameters
----------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder input.
k : Tensor [shape=(batch_size, time_steps_k, d_model)]
The keys.
v : Tensor [shape=(batch_size, time_steps_k, d_model)]
The values.
encoder_mask : Tensor
Encoder padding mask, shape is ``(batch_size, time_steps_k,
time_steps_k)`` or broadcastable shape.
decoder_mask : Tensor
Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
or broadcastable shape.
Returns
--------
q : Tensor [shape=(batch_size, time_steps_q, d_model)]
The decoder output.
self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
Decoder self attention.
cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
Decoder-encoder cross attention.
"""
context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
q = self.layer_norm1(
F.dropout(q + context_vector,
self.dropout,
training=self.training))
context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
q = self.layer_norm2(
F.dropout(q + context_vector,
self.dropout,
training=self.training))
q = self.layer_norm3(
F.dropout(q + self.ffn(q),
self.dropout,
training=self.training))
return q, self_attn_weights, cross_attn_weights

View File

@ -0,0 +1,2 @@
from parakeet.training.cli import *
from parakeet.training.experiment import *

View File

@ -1,12 +1,40 @@
import argparse
def default_argument_parser():
r"""A simple yet genral argument parser for experiments with parakeet.
This is used in examples with parakeet. And it is intended to be used by
other experiments with parakeet. It requires a minimal set of command line
arguments to start a training script.
The ``--config`` and ``--opts`` are used for overwrite the deault
configuration.
The ``--data`` and ``--output`` specifies the data path and output path.
Resuming training from existing progress at the output directory is the
intended default behavior.
The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--device`` and ``--nprocs`` specifies how to run the training.
See Also
--------
parakeet.training.experiment
Returns
-------
argparse.ArgumentParser
The parser.
"""
parser = argparse.ArgumentParser()
# yapf: disable
# data and output
parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and log. If not provided, a directory is created in runs/ to save outputs.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
@ -17,5 +45,6 @@ def default_argument_parser():
# overwrite extra config and default config
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
# yapf: enable
return parser
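A minimal usage sketch of the parser (standalone; the paths here are placeholders):

from parakeet.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args(["--data", "path/to/dataset", "--output", "runs/exp0"])
print(args.output)   # runs/exp0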

View File

@ -25,49 +25,67 @@ from collections import defaultdict
import parakeet
from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"]
class ExperimentBase(object):
"""
An experiment template in order to structure the training code and take
care of saving, loading, logging, visualization stuffs. It's intended to
be flexible and simple.
So it only handles output directory (create directory for the output,
create a checkpoint directory, dump the config in use and create
visualizer and logger) in a standard way without enforcing any
input-output protocols to the model and dataloader. It leaves the main
part for the user to implement their own (setup the model, criterion,
optimizer, define a training step, define a validation function and
customize all the text and visual logs).
It does not save too much boilerplate code. The users still have to write
the forward/backward/update manually, but they are free to add
non-standard behaviors if needed.
We have some conventions to follow.
1. Experiment should have ``model``, ``optimizer``, ``train_loader`` and
``valid_loader``, ``config`` and ``args`` attributes.
2. The config should have a ``training`` field, which has
``valid_interval``, ``save_interval`` and ``max_iteration`` keys. It is
used as the trigger to invoke validation, checkpointing and stop of the
experiment.
3. There are four methods, namely ``train_batch``, ``valid``,
``setup_model`` and ``setup_dataloader`` that should be implemented.
Feel free to add/overwrite other methods and standalone functions if you
need.
Parameters
----------
config: yacs.config.CfgNode
The configuration used for the experiment.
args: argparse.Namespace
The parsed command line arguments.
Examples
--------
>>> def main_sp(config, args):
>>> exp = Experiment(config, args)
>>> exp.setup()
>>> exp.run()
>>>
>>> config = get_cfg_defaults()
>>> parser = default_argument_parser()
>>> args = parser.parse_args()
>>> if args.config:
>>> config.merge_from_file(args.config)
>>> if args.opts:
>>> config.merge_from_list(args.opts)
>>> config.freeze()
>>>
>>> if args.nprocs > 1 and args.device == "gpu":
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else:
>>> main_sp(config, args)
"""
def __init__(self, config, args):
@ -75,6 +93,8 @@ class ExperimentBase(object):
self.args = args
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
if self.parallel:
self.init_parallel()
@ -93,16 +113,29 @@ class ExperimentBase(object):
@property
def parallel(self):
"""A flag indicating whether the experiment should run with
multiprocessing.
"""
return self.args.device == "gpu" and self.args.nprocs > 1
def init_parallel(self):
"""Init environment for multiprocess training.
"""
dist.init_parallel_env()
def save(self):
"""Save checkpoint (model parameters and optimizer states).
"""
checkpoint.save_parameters(self.checkpoint_dir, self.iteration,
self.model, self.optimizer)
def resume_or_load(self):
"""Resume from latest checkpoint at checkpoints in the output
directory or load a specified checkpoint.
If ``args.checkpoint_path`` is not None, load the checkpoint, else
resume training.
"""
iteration = checkpoint.load_parameters(
self.model,
self.optimizer,
@ -111,6 +144,13 @@ class ExperimentBase(object):
self.iteration = iteration
def read_batch(self):
"""Read a batch from the train_loader.
Returns
-------
List[Tensor]
A batch.
"""
try:
batch = next(self.iterator)
except StopIteration:
@ -119,12 +159,19 @@ class ExperimentBase(object):
return batch
def new_epoch(self):
"""Reset the train loader and increment ``epoch``.
"""
self.epoch += 1
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
self.iterator = iter(self.train_loader)
def train(self):
"""The training process.
It includes forward/backward/update and periodical validation and
saving.
"""
self.new_epoch()
while self.iteration < self.config.training.max_iteration:
self.iteration += 1
@ -137,6 +184,9 @@ class ExperimentBase(object):
self.save()
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.resume_or_load()
try:
self.train()
@ -146,6 +196,8 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_output_dir(self):
"""Create a directory used for output.
"""
# output dir
output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
@ -154,6 +206,10 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_checkpointer(self):
"""Create a directory used to save checkpoints into.
It is "checkpoints" inside the output directory.
"""
# checkpoint dir
checkpoint_dir = self.output_dir / "checkpoints"
checkpoint_dir.mkdir(exist_ok=True)
@ -162,12 +218,28 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def setup_visualizer(self):
"""Initialize a visualizer to log the experiment.
The visual log is saved in the output directory.
Notes
------
Only the main process has a visualizer with it. Using multiple
visualizers in multiple processes to write to the same log file may cause
unexpected behaviors.
"""
# visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir))
self.visualizer = visualizer
def setup_logger(self):
"""Initialize a text logger to log the experiment.
Each process has its own text logger. The logging messages are written to
the standard output and to a text file named ``worker_n.log`` in the
output directory, where ``n`` means the rank of the process.
"""
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
logger.addHandler(logging.StreamHandler())
@ -178,19 +250,34 @@ class ExperimentBase(object):
@mp_tools.rank_zero_only
def dump_config(self):
"""Save the configuration used for this experiment.
It is saved to ``config.yaml`` in the output directory at the
beginning of the experiment.
"""
with open(self.output_dir / "config.yaml", 'wt') as f:
print(self.config, file=f)
def train_batch(self):
"""The training loop. A subclass should implement this method.
"""
raise NotImplementedError("train_batch should be implemented.")
@mp_tools.rank_zero_only
@paddle.no_grad()
def valid(self):
"""The validation. A subclass should implement this method.
"""
raise NotImplementedError("valid should be implemented.")
def setup_model(self):
"""Setup model, criterion and optimizer, etc. A subclass should
implement this method.
"""
raise NotImplementedError("setup_model should be implemented.")
def setup_dataloader(self):
"""Setup training dataloader and validation dataloader. A subclass
should implement this method.
"""
raise NotImplementedError("setup_dataloader should be implemented.")

View File

@ -56,15 +56,14 @@ setup_info = dict(
'unidecode',
'numba==0.47.0',
'tqdm==4.19.8',
'llvmlite==0.31.0',
'matplotlib',
'visualdl>=2.0.1',
'scipy',
'ruamel.yaml',
'pandas',
'sox',
'soundfile',
'llvmlite==0.31.0',
'opencc',
'soundfile',
'g2p_en',
'g2pM',
'yacs',