use g2p in deepvoice3
This commit is contained in:
parent
b7e74f7889
commit
fe4b471036
|
@ -14,7 +14,7 @@ from hparams import hparams, hparams_debug_string
|
|||
from data.data import TextDataSource, MelSpecDataSource
|
||||
from nnmnkwii.datasets import FileSourceDataset
|
||||
from tqdm import trange
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
|
||||
|
||||
def build_parser():
|
||||
|
|
|
@ -25,7 +25,7 @@ import random
|
|||
|
||||
# import global hyper parameters
|
||||
from hparams import hparams
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
import builder
|
||||
|
||||
_frontend = getattr(frontend, hparams.frontend)
|
||||
|
|
|
@ -17,7 +17,7 @@ from paddle import fluid
|
|||
import paddle.fluid.dygraph as dg
|
||||
|
||||
from hparams import hparams, hparams_debug_string
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
from deepvoice3 import DeepVoiceTTS
|
||||
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ from tensorboardX import SummaryWriter
|
|||
|
||||
# import global hyper parameters
|
||||
from hparams import hparams
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
|
||||
_frontend = getattr(frontend, hparams.frontend)
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ import paddle.fluid.dygraph as dg
|
|||
|
||||
sys.path.append("../")
|
||||
import audio
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
import dry_run
|
||||
|
||||
from hparams import hparams
|
||||
|
|
|
@ -32,7 +32,7 @@ from data import (TextDataSource, MelSpecDataSource,
|
|||
LinearSpecDataSource,
|
||||
PartialyRandomizedSimilarTimeLengthSampler,
|
||||
Dataset, make_loader, create_batch)
|
||||
from modules import frontend
|
||||
import g2p as frontend
|
||||
from builder import deepvoice3, WindowRange
|
||||
from dry_run import dry_run
|
||||
from train_model import train_model
|
||||
|
|
222
modules/conv.py
222
modules/conv.py
|
@ -1,222 +0,0 @@
|
|||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
from weight_norm import Conv2D, Conv2DTranspose
|
||||
|
||||
|
||||
class Conv1D(dg.Layer):
    """
    A convolution 1D block implemented with Conv2D. For simplicity and
    ensuring the output has the same length as the input, it does not allow
    stride > 1.
    """

    def __init__(self,
                 name_scope,
                 in_cahnnels,
                 num_filters,
                 filter_size=3,
                 dilation=1,
                 groups=None,
                 causal=False,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 dtype="float32"):
        """
        Args:
            name_scope: forwarded to ``dg.Layer.__init__``.
            in_cahnnels: number of input channels, stored as
                ``self.in_channels``. The misspelling is part of the keyword
                interface and is therefore left unchanged.
            num_filters: number of output channels, forwarded to ``Conv2D``.
            filter_size: kernel width along the time axis.
            dilation: dilation along the time axis.
            groups, param_attr, bias_attr, use_cudnn, act, dtype: forwarded
                to the wrapped ``Conv2D``.
            causal: selects the padding amount and how ``forward`` trims the
                output (see below).
        """
        super(Conv1D, self).__init__(name_scope, dtype=dtype)

        # Causal: pad by the full receptive field minus one (forward() then
        # drops the trailing ``padding`` steps). Otherwise pad by half of it.
        if causal:
            padding = dilation * (filter_size - 1)
        else:
            padding = (dilation * (filter_size - 1)) // 2

        self.in_channels = in_cahnnels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.padding = padding
        self.act = act

        # The 1D convolution is expressed as a 2D convolution whose kernel,
        # stride, dilation and padding are trivial along the height axis.
        self.conv = Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, 1),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
                output channels (num_filters).
        """
        x = self.conv(x)
        if self.filter_size > 1:
            if self.causal:
                # Drop the last ``padding`` steps along the time axis.
                x = fluid.layers.slice(
                    x, axes=[3], starts=[0], ends=[-self.padding])
            elif self.filter_size % 2 == 0:
                # Even kernel width: drop the single trailing step.
                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
        return x

    def start_new_sequence(self):
        """Reset the cached weight matrix and input buffer used by add_input."""
        # NOTE(review): ``self.indices`` (cached in add_input) is not reset
        # here.
        self.temp_weight = None
        self.input_buffer = None

    def add_input(self, x):
        """
        Adding input for a time step and compute an output for a time step.

        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels, and T = 1.

        Returns:
            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
                means output channels (num_filters), and T = 1.

        """
        # NOTE(review): ``temp_weight`` / ``input_buffer`` are first assigned
        # in start_new_sequence(); presumably it must be called before the
        # first add_input() -- confirm against callers.
        if self.temp_weight is None:
            self.temp_weight = self._reshaped_weight()

        # Number of time steps spanned by the dilated kernel.
        window_size = 1 + (self.filter_size - 1) * self.dilation
        batch_size = x.shape[0]
        in_channels = x.shape[1]

        if self.filter_size > 1:
            if self.input_buffer is None:
                # First step: start from window_size - 1 zero steps.
                self.input_buffer = fluid.layers.fill_constant(
                    [batch_size, in_channels, 1, window_size - 1],
                    dtype=x.dtype,
                    value=0.0)
            else:
                # Later steps: drop the oldest step before appending.
                self.input_buffer = self.input_buffer[:, :, :, 1:]
            self.input_buffer = fluid.layers.concat(
                [self.input_buffer, x], axis=3)
            x = self.input_buffer
            if self.dilation > 1:
                # Indices of every ``dilation``-th step; built once and cached.
                if not hasattr(self, "indices"):
                    self.indices = dg.to_variable(
                        np.arange(0, window_size, self.dilation))
                # Swap axes 0 and 3 so gather selects along the time axis,
                # then swap back.
                tmp = fluid.layers.transpose(
                    self.input_buffer, perm=[3, 1, 2, 0])
                tmp = fluid.layers.gather(tmp, index=self.indices)
                tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
                x = tmp
        # Flatten the window and apply the convolution as a matrix product.
        inputs = fluid.layers.reshape(
            x, shape=[batch_size, in_channels * 1 * self.filter_size])
        out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
        out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
        # Append two singleton axes to the 2-D result.
        out = fluid.layers.reshape(out, out.shape + [1, 1])
        out = self._helper.append_activation(out, act=self.act)
        return out

    def _reshaped_weight(self):
        """
        Get the linearized weight of convolution filter, cause it is by nature
        a matmul weight. And because the model uses weight norm, compute the
        weight by weight_v * weight_g to make it faster.

        Returns:
            weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
        """
        shape = self.conv._filter_param_v.shape
        matrix_shape = [shape[0], np.prod(shape[1:])]
        weight_matrix = fluid.layers.reshape(
            self.conv._filter_param_v, shape=matrix_shape)
        # L2-normalise each row of v, then scale by g along axis 0.
        weight_matrix = fluid.layers.elementwise_mul(
            fluid.layers.l2_normalize(
                weight_matrix, axis=1),
            self.conv._filter_param_g,
            axis=0)
        return weight_matrix
|
||||
|
||||
|
||||
class Conv1DTranspose(dg.Layer):
    """
    A convolutional transpose 1D block implemented with convolutional transpose
    2D. It does not ensure that the output is exactly expanded stride times in
    time dimension.
    """

    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 dtype="float32"):
        """
        Store the hyper-parameters and build the wrapped ``Conv2DTranspose``.

        ``filter_size``, ``padding``, ``stride`` and ``dilation`` are applied
        along the last (time) axis only; the height axis uses 1 / 0 / 1 / 1.
        The remaining arguments are forwarded unchanged.
        """
        super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)

        # NOTE(review): in_channels is only stored; it is not passed to
        # Conv2DTranspose.
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups

        self.conv_transpose = Conv2DTranspose(
            self.full_name(),
            num_filters,
            filter_size=(1, filter_size),
            padding=(0, padding),
            stride=(1, stride),
            dilation=(1, dilation),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
                channels and T_in means the number of time steps of input.

        Returns:
            out (Variable): shape(B, C_out, 1, T_out), where C_out means the
                output channels and T_out means the number of time steps of
                output.
        """
        return self.conv_transpose(x)
|
|
@ -1 +0,0 @@
|
|||
This package is adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/deepvoice3_pytorch/frontend, Copyright (c) 2017: Ryuichi Yamamoto, whose license applies.
|
|
@ -1,33 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Text processing frontend
|
||||
|
||||
All frontend modules should have the following functions:
|
||||
|
||||
- text_to_sequence(text, p)
|
||||
- sequence_to_text(sequence)
|
||||
|
||||
and the property:
|
||||
|
||||
- n_vocab
|
||||
|
||||
"""
|
||||
from . import en
|
||||
|
||||
# optional Japanese frontend
|
||||
try:
|
||||
from . import jp
|
||||
except ImportError:
|
||||
jp = None
|
||||
|
||||
try:
|
||||
from . import ko
|
||||
except ImportError:
|
||||
ko = None
|
||||
|
||||
# if you are going to use the frontend, you need to modify _characters in
|
||||
# symbol.py:
|
||||
# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ'
|
||||
try:
|
||||
from . import es
|
||||
except ImportError:
|
||||
es = None
|
|
@ -1,35 +0,0 @@
|
|||
# coding: utf-8
|
||||
from modules.frontend.text.symbols import symbols
|
||||
|
||||
import nltk
|
||||
from random import random
|
||||
|
||||
n_vocab = len(symbols)
|
||||
|
||||
_arpabet = nltk.corpus.cmudict.dict()
|
||||
|
||||
|
||||
def _maybe_get_arpabet(word, p):
    """Return ``word`` or, with probability ``p``, its ARPAbet form in braces.

    The word is looked up in the module-level ``_arpabet`` table and only its
    first listed pronunciation is used. Words missing from the table are
    returned unchanged without drawing a random number.
    """
    try:
        first_pronunciation = _arpabet[word][0]
        arpabet_text = " ".join(first_pronunciation)
    except KeyError:
        return word

    if random() < p:
        return '{%s}' % arpabet_text
    return word
|
||||
|
||||
|
||||
def mix_pronunciation(text, p):
    """Replace each space-separated word by its ARPAbet form with probability ``p``."""
    words = text.split(' ')
    mixed = [_maybe_get_arpabet(word, p) for word in words]
    return ' '.join(mixed)
|
||||
|
||||
|
||||
def text_to_sequence(text, p=0.0):
    """Convert English ``text`` to a sequence of symbol IDs.

    When ``p`` is non-negative the text first goes through
    ``mix_pronunciation`` with probability ``p``; the result is encoded with
    the "english_cleaners" pipeline.
    """
    mixed = mix_pronunciation(text, p) if p >= 0 else text
    # Imported lazily under an alias so it does not shadow this function.
    from modules.frontend.text import text_to_sequence as _to_ids
    return _to_ids(mixed, ["english_cleaners"])
|
||||
|
||||
|
||||
from modules.frontend.text import sequence_to_text
|
|
@ -1,16 +0,0 @@
|
|||
# coding: utf-8
|
||||
from deepvoice3_paddle.frontend.text.symbols import symbols
|
||||
|
||||
import nltk
|
||||
from random import random
|
||||
|
||||
n_vocab = len(symbols)
|
||||
|
||||
|
||||
def text_to_sequence(text, p=0.0):
    """Encode ``text`` with the "basic_cleaners" pipeline; ``p`` is not used."""
    from deepvoice3_paddle.frontend.text import text_to_sequence as _to_ids
    return _to_ids(text, ["basic_cleaners"])
|
||||
|
||||
|
||||
from deepvoice3_paddle.frontend.text import sequence_to_text
|
|
@ -1,77 +0,0 @@
|
|||
# coding: utf-8
|
||||
|
||||
import MeCab
|
||||
import jaconv
|
||||
from random import random
|
||||
|
||||
n_vocab = 0xffff
|
||||
|
||||
_eos = 1
|
||||
_pad = 0
|
||||
_tagger = None
|
||||
|
||||
|
||||
def _yomi(mecab_result):
    """Split a MeCab result string into surface tokens and their readings.

    Each line is expected to be ``surface<TAB>comma,separated,features``.
    The reading is the 8th feature (index 7); it becomes ``None`` when the
    feature is absent or equals "*". Parsing stops at the first line without
    a tab, and the text after the final newline is ignored.

    Returns:
        (tokens, yomis): two lists of equal length.
    """
    surfaces, readings = [], []
    for row in mecab_result.split("\n")[:-1]:
        fields = row.split("\t")
        if len(fields) == 1:
            break
        surface, feature_csv = fields
        features = feature_csv.split(",")
        surfaces.append(surface)
        reading = features[7] if len(features) > 7 else None
        if reading == "*":
            reading = None
        readings.append(reading)

    return surfaces, readings
|
||||
|
||||
|
||||
def _mix_pronunciation(tokens, yomis, p):
    """Join tokens, swapping each for its reading with probability ``p``.

    A random number is drawn only for tokens that have a reading (a
    non-``None`` entry in ``yomis``).
    """
    pieces = []
    for idx, token in enumerate(tokens):
        yomi = yomis[idx]
        use_yomi = yomi is not None and random() < p
        pieces.append(yomi if use_yomi else token)
    return "".join(pieces)
|
||||
|
||||
|
||||
def mix_pronunciation(text, p):
    """Parse ``text`` with MeCab and mix in readings with probability ``p``.

    The MeCab tagger is created on first use and cached in the module-level
    ``_tagger``.
    """
    global _tagger
    tagger = _tagger
    if tagger is None:
        tagger = _tagger = MeCab.Tagger("")
    parsed = tagger.parse(text)
    surfaces, readings = _yomi(parsed)
    return _mix_pronunciation(surfaces, readings, p)
|
||||
|
||||
|
||||
def add_punctuation(text):
    """Append "。" to ``text`` unless it already ends with punctuation.

    Accepted endings are ".", ",", "、", "。", "!", "?" and the full-width
    "!" (U+FF01) and "?" (U+FF1F). Empty text is returned unchanged instead
    of raising IndexError.
    """
    if not text:
        return text
    # Full-width forms are written as escapes so that Unicode normalisation
    # of this file cannot silently collapse them into their ASCII twins.
    terminators = (".", ",", "、", "。", "!", "?", "\uff01", "\uff1f")
    if text[-1] not in terminators:
        text = text + "。"
    return text
|
||||
|
||||
|
||||
def normalize_delimitor(text):
    """Map commas and periods to the Japanese delimiters "、" and "。".

    Both the ASCII forms ("," and ".") and the full-width forms (U+FF0C and
    U+FF0E) are replaced. The full-width forms are written as escapes so
    that Unicode normalisation of this file cannot turn the second pair of
    replacements into a duplicate of the first.
    """
    replacements = (
        (",", "、"),
        (".", "。"),
        ("\uff0c", "、"),  # full-width comma
        ("\uff0e", "。"),  # full-width full stop
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
|
||||
|
||||
|
||||
def text_to_sequence(text, p=0.0):
    """Convert Japanese ``text`` to a list of code points followed by EOS.

    Steps: strip spaces, brackets and the middle dot; convert full-width
    "!" / "?" to ASCII; normalise delimiters; run ``jaconv.normalize``;
    optionally (``p > 0``) mix in readings with probability ``p``; convert
    hiragana to katakana; and make sure the text ends with punctuation.

    Full-width characters are written as escapes: spelled as plain ASCII
    the replacements below are no-ops and the duplicate list entries have
    no effect.
    """
    removed = (
        " ", "\u3000",  # ASCII and ideographic space
        "「", "」", "『", "』", "・", "【", "】",
        "(", ")", "\uff08", "\uff09",  # ASCII and full-width parentheses
    )
    for c in removed:
        text = text.replace(c, "")
    text = text.replace("\uff01", "!")
    text = text.replace("\uff1f", "?")

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    return [ord(c) for c in text] + [_eos]  # EOS
|
||||
|
||||
|
||||
def sequence_to_text(seq):
    """Turn a sequence of code points back into a string."""
    return "".join(map(chr, seq))
|
|
@ -1,17 +0,0 @@
|
|||
# coding: utf-8
|
||||
|
||||
from random import random
|
||||
|
||||
n_vocab = 0xffff
|
||||
|
||||
_eos = 1
|
||||
_pad = 0
|
||||
_tagger = None
|
||||
|
||||
|
||||
def text_to_sequence(text, p=0.0):
    """Return the code points of ``text`` followed by EOS; ``p`` is not used."""
    code_points = [ord(ch) for ch in text]
    code_points.append(_eos)  # EOS
    return code_points
|
||||
|
||||
|
||||
def sequence_to_text(seq):
    """Turn a sequence of code points back into a string."""
    chars = [chr(code_point) for code_point in seq]
    return "".join(chars)
|
|
@ -1,74 +0,0 @@
|
|||
import re
|
||||
from . import cleaners
|
||||
from .symbols import symbols
|
||||
|
||||
# Mappings from symbol to numeric ID and vice versa:
|
||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
||||
|
||||
# Regular expression matching text enclosed in curly braces:
|
||||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
|
||||
|
||||
def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of symbol IDs.

    Text inside curly braces is treated as an ARPAbet sequence, for example
    "Turn left on {HH AW1 S S T AH0 N} Street."; everything else is run
    through the named cleaners before being encoded.

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text, terminated
      by the ID of the EOS symbol '~'
    '''
    ids = []
    remaining = text

    # Peel off one "plain text {ARPAbet}" pair per iteration.
    while len(remaining):
        match = _curly_re.match(remaining)
        if match is None:
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        plain, arpabet, remaining = match.group(1), match.group(2), match.group(3)
        ids.extend(_symbols_to_sequence(_clean_text(plain, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))

    # Append EOS token
    ids.append(_symbol_to_id['~'])
    return ids
|
||||
|
||||
|
||||
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string.

    Unknown IDs are skipped. ARPAbet symbols (stored with a leading "@") are
    wrapped in curly braces, and adjacent braces are merged with a space.
    '''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # Enclose ARPAbet back in curly braces:
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    return ''.join(pieces).replace('}{', ' ')
|
||||
|
||||
|
||||
def _clean_text(text, cleaner_names):
    """Run ``text`` through each named cleaner from ``cleaners``, in order.

    Args:
      text: string to clean
      cleaner_names: iterable of attribute names in the ``cleaners`` module

    Returns:
      The cleaned text.

    Raises:
      AttributeError: if a name does not exist in ``cleaners``. A plain
        ``getattr`` already raised this type, so the "Unknown cleaner" check
        was never reached; the type is kept so existing handlers still
        match, but the error now carries the intended message.
    """
    for name in cleaner_names:
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise AttributeError('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
||||
|
||||
|
||||
def _symbols_to_sequence(symbols):
    """Map each symbol that passes ``_should_keep_symbol`` to its ID."""
    kept = filter(_should_keep_symbol, symbols)
    return [_symbol_to_id[symbol] for symbol in kept]
|
||||
|
||||
|
||||
def _arpabet_to_sequence(text):
    """Encode whitespace-separated ARPAbet symbols, prefixing each with "@"."""
    prefixed = []
    for phoneme in text.split():
        prefixed.append('@' + phoneme)
    return _symbols_to_sequence(prefixed)
|
||||
|
||||
|
||||
def _should_keep_symbol(s):
    """Return True if ``s`` is a known symbol other than '_' and '~'.

    The exclusion uses equality rather than ``is not``: identity comparison
    against string literals only works by accident of interning and raises a
    SyntaxWarning on Python 3.8+.
    """
    return s in _symbol_to_id and s not in ('_', '~')
|
|
@ -1,104 +0,0 @@
|
|||
'''
|
||||
Cleaners are transformations that run over the input text at both training and
|
||||
eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as
|
||||
the "cleaners" hyperparameter. Some cleaners are English-specific. You'll
|
||||
typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated
|
||||
to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you
|
||||
should also update the symbols in symbols.py to match your data).
|
||||
'''
|
||||
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
# Each entry pairs a case-insensitive pattern -- a word boundary, the
# abbreviation, then a literal "." -- with its replacement text.
# NOTE(review): 'misess' looks like a misspelling, but changing it alters the
# cleaned text the model sees -- confirm before touching.
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [
                      ('mrs', 'misess'),
                      ('mr', 'mister'),
                      ('dr', 'doctor'),
                      ('st', 'saint'),
                      ('co', 'company'),
                      ('jr', 'junior'),
                      ('maj', 'major'),
                      ('gen', 'general'),
                      ('drs', 'doctors'),
                      ('rev', 'reverend'),
                      ('lt', 'lieutenant'),
                      ('hon', 'honorable'),
                      ('sgt', 'sergeant'),
                      ('capt', 'captain'),
                      ('esq', 'esquire'),
                      ('ltd', 'limited'),
                      ('col', 'colonel'),
                      ('ft', 'fort'),
                  ]]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
    """Apply every (pattern, replacement) pair in ``_abbreviations`` in order."""
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
|
||||
|
||||
|
||||
def expand_numbers(text):
    """Return ``text`` as processed by ``normalize_numbers``."""
    normalized = normalize_numbers(text)
    return normalized
|
||||
|
||||
|
||||
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
||||
|
||||
|
||||
def collapse_whitespace(text):
    """Replace every match of ``_whitespace_re`` in ``text`` with one space."""
    return _whitespace_re.sub(' ', text)
|
||||
|
||||
|
||||
def convert_to_ascii(text):
    """Return ``text`` as processed by ``unidecode``."""
    ascii_text = unidecode(text)
    return ascii_text
|
||||
|
||||
|
||||
def add_punctuation(text):
    """Append "." to non-empty ``text`` that does not end with one of ``!,.:;?``.

    Without a final punctuation mark the decoder is confused about when to
    output EOS.
    """
    if text and not text.endswith(('!', ',', '.', ':', ';', '?')):
        return text + '.'
    return text
|
||||
|
||||
|
||||
def basic_cleaners(text):
    '''
    Basic pipeline: lowercase, then collapse whitespace. No transliteration
    is performed.
    '''
    return collapse_whitespace(lowercase(text))
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
    '''Pipeline for non-English text: to ASCII, lowercase, collapse whitespace.'''
    return collapse_whitespace(lowercase(convert_to_ascii(text)))
|
||||
|
||||
|
||||
def english_cleaners(text):
    '''
    Pipeline for English text. Applies, in order: ASCII conversion, final
    punctuation, lowercasing, number expansion, abbreviation expansion and
    whitespace collapsing.
    '''
    pipeline = (
        convert_to_ascii,
        add_punctuation,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
|
|
@ -1,67 +0,0 @@
|
|||
import re
|
||||
|
||||
valid_symbols = [
|
||||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
|
||||
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
|
||||
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
|
||||
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1',
|
||||
'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
|
||||
'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T',
|
||||
'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y',
|
||||
'Z', 'ZH'
|
||||
]
|
||||
|
||||
_valid_symbol_set = set(valid_symbols)
|
||||
|
||||
|
||||
class CMUDict:
    '''
    Thin wrapper around CMUDict data.
    http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    '''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Load entries from a path (opened as latin-1) or an iterable of lines.

        With ``keep_ambiguous=False`` only words that have exactly one
        pronunciation are kept.
        '''
        if not isinstance(file_or_path, str):
            loaded = _parse_cmudict(file_or_path)
        else:
            with open(file_or_path, encoding='latin-1') as handle:
                loaded = _parse_cmudict(handle)

        if not keep_ambiguous:
            unambiguous = {}
            for word, prons in loaded.items():
                if len(prons) == 1:
                    unambiguous[word] = prons
            loaded = unambiguous
        self._entries = loaded

    def __len__(self):
        '''Number of distinct words.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word, or None.'''
        key = word.upper()
        return self._entries.get(key)
|
||||
|
||||
|
||||
_alt_re = re.compile(r'\([0-9]+\)')
|
||||
|
||||
|
||||
def _parse_cmudict(file):
    """Parse CMUdict lines into ``{word: [pronunciation, ...]}``.

    Only lines starting with an upper-case ASCII letter or an apostrophe
    are considered. The word is everything up to the first run of
    whitespace, with any "(n)" alternative marker removed; the rest of the
    line is the pronunciation. Splitting on the first whitespace run
    (rather than on every single space) keeps the whole pronunciation
    however many spaces separate it from the word. Lines with no
    pronunciation field, or whose pronunciation is rejected by
    ``_get_pronunciation``, are skipped.

    Args:
        file: iterable of text lines.

    Returns:
        dict mapping each word to its list of pronunciation strings.
    """
    cmudict = {}
    for line in file:
        if not line:
            continue
        first = line[0]
        if not ('A' <= first <= 'Z' or first == "'"):
            continue
        parts = line.split(None, 1)
        if len(parts) < 2:
            # Word with no pronunciation: nothing to record.
            continue
        word = _alt_re.sub('', parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
|
||||
|
||||
|
||||
def _get_pronunciation(s):
    """Return the stripped pronunciation, or None if any symbol is invalid.

    ``s`` is stripped and split on single spaces; every piece must be in
    ``_valid_symbol_set``.
    """
    symbols = s.strip().split(' ')
    if all(symbol in _valid_symbol_set for symbol in symbols):
        return ' '.join(symbols)
    return None
|
|
@ -1,71 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import inflect
|
||||
import re
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
|
||||
|
||||
def _remove_commas(m):
    """Return group 1 of match ``m`` with all commas removed."""
    digit_groups = m.group(1).split(',')
    return ''.join(digit_groups)
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
    """Return group 1 of match ``m`` with each "." spelled out as " point "."""
    pieces = m.group(1).split('.')
    return ' point '.join(pieces)
|
||||
|
||||
|
||||
def _expand_dollars(m):
    """Spell out the dollar amount captured in group 1 of match ``m``.

    The amount is split on "." into a dollar part and a cent part, each
    read as an integer (an empty part counts as 0). Amounts with more than
    one "." are returned as-is followed by " dollars".
    """
    amount = m.group(1)
    pieces = amount.split('.')
    if len(pieces) > 2:
        return amount + ' dollars'  # Unexpected format

    dollars = int(pieces[0]) if pieces[0] else 0
    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0

    spoken = []
    if dollars:
        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    if not spoken:
        return 'zero dollars'
    return ', '.join(spoken)
|
||||
|
||||
|
||||
def _expand_ordinal(m):
    """Pass the whole matched ordinal to ``_inflect.number_to_words``."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
|
||||
|
||||
|
||||
def _expand_number(m):
    """Spell out the integer matched by ``m``.

    Numbers strictly between 1000 and 3000 get special handling: 2000 is
    "two thousand", 2001-2009 are "two thousand <n>", exact hundreds are
    "<n> hundred", and the rest are read in two-digit groups with "oh" for
    zero. Every other number is spelled out without "and".
    """
    num = int(m.group(0))
    to_words = _inflect.number_to_words

    if not (num > 1000 and num < 3000):
        return to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if num > 2000 and num < 2010:
        return 'two thousand ' + to_words(num % 100)
    if num % 100 == 0:
        return to_words(num // 100) + ' hundred'
    grouped = to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
|
||||
|
||||
|
||||
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: strip commas inside numbers,
    pounds, dollars, decimal points, ordinals, then remaining integers.
    """
    steps = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in steps:
        text = pattern.sub(replacement, text)
    return text
|
|
@ -1,18 +0,0 @@
|
|||
'''
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
The default is a set of ASCII characters that works well for English or text
|
||||
that has been run through Unidecode. For other data, you can modify _characters.
|
||||
See TRAINING_DATA.md for details.
|
||||
'''
|
||||
from .cmudict import valid_symbols
|
||||
|
||||
_pad = '_'
|
||||
_eos = '~'
|
||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
|
||||
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
_arpabet = ['@' + s for s in valid_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad, _eos] + list(_characters) + _arpabet
|
158
modules/loss.py
158
modules/loss.py
|
@ -1,158 +0,0 @@
|
|||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from numba import jit
|
||||
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
def masked_mean(inputs, mask):
    """
    Mean of ``inputs`` over the positions selected by ``mask``.

    Args:
        inputs (Variable): Shape(B, C, 1, T), the input, where B means
            batch size, C means channels of input, T means timesteps of
            the input.
        mask (Variable): Shape(B, T), a mask.
    Returns:
        loss (Variable): Shape(1, ), masked mean.
    """
    n_channels = inputs.shape[1]
    batch, steps = mask.shape[0], mask.shape[-1]

    # Bring the mask to (B, 1, 1, T) and repeat it across channels.
    mask_4d = fluid.layers.reshape(mask, shape=[batch, 1, 1, steps])
    tiled_mask = fluid.layers.expand(
        mask_4d, expand_times=[1, n_channels, 1, 1])
    tiled_mask.stop_gradient = True

    n_valid = fluid.layers.reduce_sum(tiled_mask)
    n_valid.stop_gradient = True

    masked_total = fluid.layers.reduce_sum(inputs * tiled_mask)
    return masked_total / n_valid
|
||||
|
||||
|
||||
@jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    """
    Build a (max_N, max_T) float32 weight matrix for one example.

    For n < N and t < T the entry is 1 - exp(-(n/N - t/T)^2 / (2 g^2)),
    which is 0 where n/N equals t/T and grows towards 1 away from it.
    Entries outside the N x T region stay 0.
    """
    W = np.zeros((max_N, max_T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
    return W
|
||||
|
||||
|
||||
def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
    """
    Stack one transposed ``guided_attention`` matrix per batch element.

    Returns:
        numpy.ndarray: Shape(B, max_target_len, max(input_lengths)), float32.
    """
    batch_size = len(input_lengths)
    longest_input = input_lengths.max()
    weights = np.zeros(
        (batch_size, max_target_len, longest_input), dtype=np.float32)
    for b, n_input in enumerate(input_lengths):
        per_example = guided_attention(n_input, longest_input,
                                       target_lengths[b], max_target_len, g)
        weights[b] = per_example.T
    return weights
|
||||
|
||||
|
||||
class TTSLoss(object):
    """Collection of the loss terms used to train the TTS model."""

    def __init__(self,
                 masked_weight=0.0,
                 priority_weight=0.0,
                 binary_divergence_weight=0.0,
                 guided_attention_sigma=0.2):
        """Store the loss weights and the guided-attention sigma."""
        self.masked_weight = masked_weight
        self.priority_weight = priority_weight
        self.binary_divergence_weight = binary_divergence_weight
        self.guided_attention_sigma = guided_attention_sigma

    def _mask_weighted_mean(self, values, mask):
        """Blend masked and plain means of ``values`` by ``masked_weight``.

        Falls back to the plain mean when the weight is not positive or no
        mask is given.
        """
        w = self.masked_weight
        if w > 0 and mask is not None:
            return w * masked_mean(values, mask) + (
                1 - w) * fluid.layers.reduce_mean(values)
        return fluid.layers.reduce_mean(values)

    def l1_loss(self, prediction, target, mask, priority_bin=None):
        """L1 loss, optionally emphasising the first ``priority_bin`` channels."""
        abs_diff = fluid.layers.abs(prediction - target)

        # basic mask-weighted l1 loss
        base_l1_loss = self._mask_weighted_mean(abs_diff, mask)

        p = self.priority_weight
        if not (p > 0 and priority_bin is not None):
            return base_l1_loss

        # mask-weighted priority channels' l1-loss
        priority_abs_diff = fluid.layers.slice(
            abs_diff, axes=[1], starts=[0], ends=[priority_bin])
        priority_loss = self._mask_weighted_mean(priority_abs_diff, mask)

        # priority weighted sum
        return p * priority_loss + (1 - p) * base_l1_loss

    def binary_divergence(self, prediction, target, mask):
        """Element-wise log loss between prediction and target, mask-weighted."""
        flat_prediction = fluid.layers.reshape(prediction, [-1, 1])
        flat_target = fluid.layers.reshape(target, [-1, 1])
        flat_loss = fluid.layers.log_loss(
            flat_prediction, flat_target, epsilon=1e-8)
        bin_div = fluid.layers.reshape(flat_loss, prediction.shape)
        return self._mask_weighted_mean(bin_div, mask)

    @staticmethod
    def done_loss(done_hat, done):
        """Mean log loss between the predicted and reference done flags."""
        flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
        flat_done = fluid.layers.reshape(done, [-1, 1])
        per_step = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
        return fluid.layers.reduce_mean(per_step)

    def attention_loss(self, predicted_attention, input_lengths,
                       target_lengths):
        """
        Given valid encoder_lengths and decoder_lengths, compute a diagonal
        guide, and compute loss from the predicted attention and the guide.

        Args:
            predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
                alignment tensor, where B means batch size, T_dec means number
                of time steps of the decoder, T_enc means the number of time
                steps of the encoder, * means other possible dimensions.
            input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
                (time steps) of encoder outputs.
            target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
                valid lengths (time steps) of decoder outputs.

        Returns:
            loss (Variable): Shape(1, ) attention loss.
        """
        # The shape is unpacked into exactly four axes; only the decoder
        # length is used below.
        _, _, max_target_len, _ = predicted_attention.shape
        guide = guided_attentions(input_lengths, target_lengths,
                                  max_target_len,
                                  self.guided_attention_sigma)
        guide_var = dg.to_variable(guide)
        return fluid.layers.reduce_mean(predicted_attention * guide_var)
|
|
@ -1,458 +0,0 @@
|
|||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
import numpy as np
|
||||
|
||||
import conv
|
||||
import weight_norm as weight_norm
|
||||
|
||||
|
||||
def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       dropout=0.0,
       epsilon=1e-30,
       act=None,
       is_test=False,
       dtype="float32"):
    """
    Build a weight-normalized linear layer whose weights are initialized as
    normal(0, std=np.sqrt((1 - dropout) / in_features)), so that it can be
    used together with dropout.

    Args:
        name_scope (str): name scope handed to the underlying layer.
        in_features (int | list of int): input feature size, or one size
            per input when the layer takes several inputs.
        size (int): number of output units.
        num_flatten_dims (int): forwarded to ``weight_norm.FC``.
        dropout (float): dropout rate used to scale the initial std.
        epsilon (float): forwarded to ``weight_norm.FC``.
        act (str | None): activation forwarded to ``weight_norm.FC``.
        is_test (bool): forwarded to ``weight_norm.FC``.
        dtype (str): parameter dtype.

    Returns:
        The constructed ``weight_norm.FC`` layer.
    """
    if isinstance(in_features, int):
        in_features = [in_features]

    # One ParamAttr per input, each with its own fan-in dependent std.
    weight_attrs = [
        fluid.ParamAttr(initializer=fluid.initializer.NormalInitializer(
            scale=np.sqrt((1 - dropout) / in_feature)))
        for in_feature in in_features
    ]
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))

    # epsilon and is_test used to be accepted here but silently dropped;
    # hand them to the layer so a caller's setting actually takes effect.
    layer = weight_norm.FC(
        name_scope,
        size,
        num_flatten_dims=num_flatten_dims,
        epsilon=epsilon,
        param_attr=weight_attrs,
        bias_attr=bias_attr,
        act=act,
        is_test=is_test,
        dtype=dtype)
    return layer
|
||||
|
||||
|
||||
def Conv1D(name_scope,
           in_channels,
           num_filters,
           filter_size=3,
           dilation=1,
           groups=None,
           causal=False,
           std_mul=1.0,
           dropout=0.0,
           use_cudnn=True,
           act=None,
           dtype="float32"):
    """
    Build a ``conv.Conv1D`` layer suited for use with dropout: the weight is
    drawn from
    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_channels)))
    and the bias starts at zero.
    """
    init_std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))

    kernel_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(
            loc=0.0, scale=init_std))
    zero_bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))

    return conv.Conv1D(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        dilation,
        groups=groups,
        causal=causal,
        param_attr=kernel_attr,
        bias_attr=zero_bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
|
||||
|
||||
|
||||
def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    """
    Build a ``dg.Embedding`` whose table is initialized from a normal
    distribution with standard deviation ``std``.

    Args:
        name_scope (str): name scope handed to ``dg.Embedding``.
        num_embeddings (int): number of rows of the embedding table.
        embed_dim (int): size of each embedding vector.
        is_sparse (bool): forwarded to ``dg.Embedding``.
        is_distributed (bool): forwarded to ``dg.Embedding``.
        padding_idx (int | None): forwarded to ``dg.Embedding``.
        std (float): scale of the normal initializer.
        dtype (str): parameter dtype.

    Returns:
        The constructed ``dg.Embedding`` layer.
    """
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    # is_sparse / is_distributed were previously accepted but never reached
    # the layer; forward them the same way PositionEmbedding does.
    layer = dg.Embedding(
        name_scope, (num_embeddings, embed_dim),
        is_sparse=is_sparse,
        is_distributed=is_distributed,
        padding_idx=padding_idx,
        param_attr=weight_attr,
        dtype=dtype)
    return layer
|
||||
|
||||
|
||||
class Conv1DGLU(dg.Layer):
    """
    A 1-D convolution block with a GLU activation. Dropout is applied to the
    input x, speaker embeddings (if given) are fused into the content half
    through a softsign-activated 1x1 convolution, and, when ``residual`` is
    set, the input is added back and the sum is scaled by np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)

        # speaker conditioning
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim

        # convolution geometry
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # initialization scale and dropout rate
        self.std_mul = std_mul
        self.dropout = dropout

        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses residual connection"\
                "the input_channes should equals num_filters"

        # Twice the filters: one half is the content, the other the gate.
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            # Pointwise projection of the speaker embedding to num_filters.
            self.fc = Conv1D(
                self.full_name(),
                speaker_dim,
                num_filters,
                filter_size=1,
                dilation=1,
                causal=False,
                act="softsign",
                dtype=dtype)

    def _gated_output(self, conv_out, shortcut, speaker_embed):
        """Split conv_out into content and gate, fuse the speaker embedding,
        apply the GLU and, if enabled, the scaled residual connection."""
        content, gate = fluid.layers.split(conv_out, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            content = content + self.fc(speaker_embed)

        gated = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), content)

        if self.residual:
            gated = fluid.layers.scale(gated + shortcut, np.sqrt(0.5))
        return gated

    def forward(self, x, speaker_embed_bc1t=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input
                channels, T means input time steps.
            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
                speaker embed, where C_sp means speaker embedding size. Note
                that when using residual connection, the Conv1DGLU does not
                change the number of channels, so out channels equals input
                channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU,
                where C_out means the output channels of Conv1DGLU.
        """
        dropped = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        return self._gated_output(self.conv(dropped), x, speaker_embed_bc1t)

    def add_input(self, x, speaker_embed_bc11=None):
        """
        Step-wise counterpart of ``forward``: feeds one step to the
        convolution's ``add_input`` and produces the step output.

        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)

        Outputs:
            out: shape(B, num_filters, 1, time_steps), where time_steps = 1
        """
        dropped = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        step_out = self.conv.add_input(dropped)
        return self._gated_output(step_out, x, speaker_embed_bc11)
|
||||
|
||||
|
||||
def Conv1DTranspose(name_scope,
                    in_channels,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    std_mul=1.0,
                    dropout=0.0,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    """
    Build a ``conv.Conv1DTranspose`` layer whose weight is drawn from
    normal(std=np.sqrt(std_mul * (1-dropout) / (in_channels * filter_size)))
    and whose bias starts at zero.
    """
    init_std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))

    kernel_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(scale=init_std))
    zero_bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))

    return conv.Conv1DTranspose(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        groups=groups,
        param_attr=kernel_attr,
        bias_attr=zero_bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
|
||||
|
||||
|
||||
def compute_position_embedding(rad):
    """
    Turn a table of angles into a position embedding table.

    ``rad`` is the transposed angle table, shape(embed_dim, n_vocab). Rows at
    even indices are replaced by their sine and rows at odd indices by their
    cosine; the result is transposed back before being returned.
    """
    # The 2-way unpacking also rejects inputs that are not 2-D.
    embed_dim, _ = rad.shape

    table = rad
    for first_row, trig in ((0, fluid.layers.sin), (1, fluid.layers.cos)):
        rows = dg.to_variable(
            np.arange(first_row, embed_dim, 2).astype("int32"))
        # Angles are always read from the untouched input table.
        table = fluid.layers.scatter(
            table, rows, trig(fluid.layers.gather(rad, rows)))

    return fluid.layers.transpose(table, perm=[1, 0])
|
||||
|
||||
|
||||
def position_encoding_init(n_position,
                           d_pos_vec,
                           position_rate=1.0,
                           sinusoidal=True):
    """ Build the (n_position, d_pos_vec) position encoding table.

    With ``sinusoidal=False`` the table holds the raw angles; otherwise even
    columns hold their sine and odd columns their cosine.
    """

    def _angles(pos):
        # position 0 is reserved for the padding token: an all-zero vector
        if pos == 0:
            return np.zeros(d_pos_vec)
        return [
            position_rate * pos / np.power(10000, 2 * (dim // 2) / d_pos_vec)
            for dim in range(d_pos_vec)
        ]

    table = np.array([_angles(pos) for pos in range(n_position)])

    if sinusoidal:
        table[1:, 0::2] = np.sin(table[1:, 0::2])  # dim 2i
        table[1:, 1::2] = np.cos(table[1:, 1::2])  # dim 2i+1

    return table
|
||||
|
||||
|
||||
class PositionEmbedding(dg.Layer):
    """
    Position embedding whose stored weight is the table of raw angles
    (``position_encoding_init(..., sinusoidal=False)``). The sine/cosine is
    applied at lookup time, which lets the angles be rescaled by a
    speaker-dependent position rate first.
    """

    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
        # The inner embedding only holds the angle table; padding is applied
        # by the lookup op in forward, so padding_idx is None here.
        self.embed = dg.Embedding(
            self.full_name(),
            size=(n_position, d_pos_vec),
            is_sparse=is_sparse,
            is_distributed=is_distributed,
            padding_idx=None,
            param_attr=param_attr,
            dtype=dtype)
        angle_table = position_encoding_init(
            n_position,
            d_pos_vec,
            position_rate=position_rate,
            sinusoidal=False)
        self.set_weight(angle_table.astype(dtype))

        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        # -1 is the lookup table op's value for "no padding index"; negative
        # indices count from the end of the table.
        if padding_idx is None:
            self._padding_idx = -1
        elif padding_idx >= 0:
            self._padding_idx = padding_idx
        else:
            self._padding_idx = n_position + padding_idx
        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype

    def set_weight(self, array):
        """Overwrite the angle table in place with ``array`` (same shape)."""
        assert self.embed._w.shape == list(array.shape), "shape does not match"
        self.embed._w._ivar.value().get_tensor().set(
            array, fluid.framework._current_expected_place())

    def _lookup(self, ids, table, padding_idx):
        """Append a lookup_table op reading rows ``ids`` from ``table``."""
        result = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="lookup_table",
            inputs={"Ids": ids,
                    "W": table},
            outputs={"Out": result},
            attrs={
                "is_sparse": self._is_sparse,
                "is_distributed": self._is_distributed,
                "remote_prefetch": self._remote_prefetch,
                "padding_idx": padding_idx,
            })
        return result

    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): Shape (B, T, 1), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. A Python/numpy scalar or a Variable with shape [1, 1]
                is used for every example. A Variable with more than one
                element must have shape [B, 1] and holds one position rate
                per example.
        Returns:
            out (Variable): position embedding looked up for ``indices``;
                presumably Shape(B, T, C_pos) where C_pos means position
                embedding size (TODO confirm against the lookup_table op).
        """
        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
        batch_size = indices.shape[0]

        # No speaker rate: use the stored angles as they are.
        if speaker_position_rate is None:
            return self._lookup(indices,
                                compute_position_embedding(rad),
                                self._padding_idx)

        # One rate shared by the whole batch: scale the angles once.
        rate_is_scalar = np.isscalar(speaker_position_rate)
        if rate_is_scalar or (
                isinstance(speaker_position_rate, fluid.framework.Variable) and
                speaker_position_rate.shape == [1, 1]):
            if rate_is_scalar:
                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
            else:
                scaled_rad = fluid.layers.elementwise_mul(
                    rad, speaker_position_rate[0])
            return self._lookup(indices,
                                compute_position_embedding(scaled_rad),
                                self._padding_idx)

        # One rate per example: build a table and look up example by example.
        if np.prod(speaker_position_rate.shape) > 1:
            assert speaker_position_rate.shape == [batch_size, 1]
            per_example = []
            for i in range(batch_size):
                rate = speaker_position_rate[i]  # rate has shape [1]
                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
                weight = compute_position_embedding(scaled_rad)
                # this branch looks up without a padding index (-1)
                per_example.append(self._lookup(indices[i], weight, -1))
            return fluid.layers.stack(per_example)

        raise Exception("Then you can just use position rate at init")
|
|
@ -1,863 +0,0 @@
|
|||
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
from six.moves import reduce
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle.fluid import core
|
||||
from paddle.fluid.layers import utils
|
||||
from paddle.fluid.framework import Variable
|
||||
from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
|
||||
|
||||
|
||||
def _norm(p, dim):
|
||||
"""Computes the norm over all dimensions except dim.
|
||||
It differs from pytorch implementation that it does not keep dim.
|
||||
This difference is related with the broadcast mechanism in paddle.
|
||||
Read elementeise_mul for more.
|
||||
"""
|
||||
|
||||
if dim is None:
|
||||
return np.linalg.norm(p, ord=2, axis=None)
|
||||
elif dim == 0:
|
||||
p = np.reshape(p, newshape=(p.shape[0], -1))
|
||||
return np.linalg.norm(p, ord=2, axis=1)
|
||||
elif dim == p.ndim - 1:
|
||||
p = np.reshape(p, newshape=(-1, p.shape[-1]))
|
||||
return np.linalg.norm(p, ord=2, axis=0)
|
||||
else:
|
||||
perm = list(range(p.ndim))
|
||||
perm[0] = dim
|
||||
perm[dim] = 0
|
||||
return _norm(np.transpose(p, axes=perm))
|
||||
|
||||
|
||||
class FC(dg.Layer):
    """
    **Weight-Normalized Fully Connected Layer**

    This class creates a fully connected layer in the network. It can take
    one or multiple tensors as its inputs (input can be a list of Variable,
    see Args in detail). It creates a pair of variables called
    (magnitude(g), direction(V)) for each input tensor. The direction is
    normalized along its first axis and multiplied by the magnitude to form
    the fully connected weight matrix from each input unit to each output
    unit. The layer multiplies each input tensor with its corresponding
    weight; if multiple input tensors are given, the results are summed up.
    If a bias is created it is added to the output. Finally, if activation
    is not None, it is applied to the output as well.

    When the input is single tensor:

    .. math::

        Out = Act({X(normalize(V)g) + b})

    When the input are multiple tensors:

    .. math::

        Out = Act({\\sum_{i=0}^{N-1}X_i(normalize(V_i)g_i) + b})

    In the above equation:

    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
    * :math:`X_i`: The i-th input tensor.
    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
    * :math:`b`: The bias parameter created by this layer (if needed).
    * :math:`Act`: The activation function.
    * :math:`Out`: The output tensor.

    Args:
        name_scope(str): The name of this class.
        size(int): The number of output units in this layer.
        num_flatten_dims (int): The fc layer can accept an input tensor with more than
            two dimensions. If this happens, the multidimensional tensor will first be flattened
            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
            dimensions will be flatten to form the first dimension of the final matrix (height of
            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
            form the second dimension of the final matrix (width of the matrix). For example, suppose
            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
        epsilon (float): The epsilon attribute passed to the norm op that
            normalizes the direction. Default: 1e-30
        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for the
            direction parameters of this layer.
        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
        act (str|None): Activation to be applied to the output of this layer.
        is_test(bool): Accepted for interface compatibility; it is not read
            anywhere in this class. Default: False
        dtype(str): Dtype used for weight

    Examples:
        .. code-block:: python

            from paddle.fluid.dygraph.base import to_variable
            import paddle.fluid as fluid
            import numpy as np

            data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
            with fluid.dygraph.guard():
                fc = FC( "fc", 64, num_flatten_dims=2)
                data = to_variable( data )
                out = fc( data )

    """

    def __init__(self,
                 name_scope,
                 size,
                 num_flatten_dims=1,
                 epsilon=1e-30,
                 param_attr=None,
                 bias_attr=None,
                 act=None,
                 is_test=False,
                 dtype="float32"):
        super(FC, self).__init__(name_scope, dtype)

        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._epsilon = epsilon
        self._dtype = dtype
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        # One magnitude / direction parameter per input, filled in by
        # _build_once.
        self.__g = list()
        self.__v = list()

    # The properties below expose the parameters of the first input; the
    # index argument always takes its default under attribute access.
    @property
    def _v(self, i=0):
        return self.__v[i]

    @property
    def _g(self, i=0):
        return self.__g[i]

    @_v.setter
    def _v(self, value, i=0):
        # Parameter is not imported at module level; reach it through fluid.
        assert isinstance(value, fluid.framework.Parameter)
        self.__v[i] = value

    @_g.setter
    def _g(self, value, i=0):
        assert isinstance(value, fluid.framework.Parameter)
        self.__g[i] = value

    def _build_once(self, input):
        """Create the direction, magnitude and bias parameters."""
        i = 0
        for inp, param in self._helper.iter_inputs_and_params(input,
                                                              self._param_attr):
            input_shape = inp.shape

            # Direction V: [flattened input width, size].
            param_shape = [
                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
                       1)
            ] + [self._size]
            self.__v.append(
                self.add_parameter(
                    "_v%d" % i,
                    self.create_parameter(
                        attr=param,
                        shape=param_shape,
                        dtype=self._dtype,
                        is_bias=False)))

            # Magnitude g starts as the column norms of the initial V, so
            # the effective weight initially equals V.
            magnitude_shape = param_shape[1:]
            magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)

            self.__g.append(
                self.add_parameter(
                    "_g%d" % i,
                    self.create_parameter(
                        attr=fluid.ParamAttr(
                            initializer=fluid.initializer.NumpyArrayInitializer(
                                magnitude_value)),
                        shape=magnitude_shape,
                        dtype=self._dtype,
                        is_bias=False)))
            i += 1

        size = list([self._size])
        self._b = self.create_parameter(
            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized linear map to ``input`` (a Variable
        or a list of Variables), then the bias and the activation."""
        mul_results = list()
        for i, (inp, _) in enumerate(
                self._helper.iter_inputs_and_params(input, self._param_attr)):
            # normalize V along axis 0
            v_norm = self._helper.create_variable_for_type_inference(
                self._dtype)
            v_normalized = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="norm",
                inputs={"X": self.__v[i]},
                outputs={"Out": v_normalized,
                         "Norm": v_norm},
                attrs={"axis": 0,
                       "epsilon": self._epsilon})
            # weight = normalize(V) * g, with g broadcast along axis 1
            weight = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="elementwise_mul",
                inputs={"X": [v_normalized],
                        "Y": [self.__g[i]]},
                outputs={"Out": [weight]},
                attrs={"axis": 1})
            tmp = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="mul",
                inputs={"X": inp,
                        "Y": weight},
                outputs={"Out": tmp},
                attrs={
                    "x_num_col_dims": self._num_flatten_dims,
                    "y_num_col_dims": 1
                })
            mul_results.append(tmp)

        if len(mul_results) == 1:
            pre_bias = mul_results[0]
        else:
            pre_bias = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="sum",
                inputs={"X": mul_results},
                outputs={"Out": pre_bias},
                attrs={"use_mkldnn": False})

        # Compare against None explicitly instead of relying on the
        # truthiness of a Variable.
        if self._b is not None:
            pre_activation = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._b]},
                outputs={"Out": [pre_activation]},
                attrs={"axis": self._num_flatten_dims})
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_activation, act=self._act)
|
||||
|
||||
|
||||
class Conv2D(dg.Layer):
|
||||
"""
|
||||
The convolution2D layer calculates the output based on the input, filter
|
||||
and strides, paddings, dilations, groups parameters. Input and
|
||||
Output are in NCHW format, where N is batch size, C is the number of
|
||||
channels, H is the height of the feature, and W is the width of the feature.
|
||||
Filter is in MCHW format, where M is the number of output image channels,
|
||||
C is the number of input image channels, H is the height of the filter,
|
||||
and W is the width of the filter. If the groups is greater than 1,
|
||||
C will equal the number of input image channels divided by the groups.
|
||||
Please refer to UFLDL's `convolution
|
||||
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
|
||||
for more detials.
|
||||
If bias attribution and activation type are provided, bias is added to the
|
||||
output of the convolution, and the corresponding activation function is
|
||||
applied to the final result.
|
||||
|
||||
For each input :math:`X`, the equation is:
|
||||
|
||||
.. math::
|
||||
|
||||
Out = \sigma ((Vg) \\ast X + b)
|
||||
|
||||
Where:
|
||||
|
||||
* :math:`X`: Input value, a tensor with NCHW format.
|
||||
* :math:`V`: Filter direction value, a tensor with MCHW format.
|
||||
* :math:`g`: Filter magnitude value, a tensor with M format.
|
||||
* :math:`\\ast`: Convolution operation.
|
||||
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
|
||||
* :math:`\\sigma`: Activation function.
|
||||
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
|
||||
|
||||
Example:
|
||||
|
||||
- Input:
|
||||
|
||||
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
|
||||
|
||||
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
|
||||
|
||||
- Output:
|
||||
|
||||
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
|
||||
|
||||
Where
|
||||
|
||||
.. math::
|
||||
|
||||
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
|
||||
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
|
||||
|
||||
Args:
|
||||
name_scope(str) : The name for this class.
|
||||
num_filters(int): The number of filter. It is as same as the output
|
||||
image channel.
|
||||
filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
|
||||
it must contain two integers, (filter_size_H, filter_size_W).
|
||||
Otherwise, the filter will be a square.
|
||||
stride (int|tuple): The stride size. If stride is a tuple, it must
|
||||
contain two integers, (stride_H, stride_W). Otherwise, the
|
||||
stride_H = stride_W = stride. Default: stride = 1.
|
||||
padding (int|tuple): The padding size. If padding is a tuple, it must
|
||||
contain two integers, (padding_H, padding_W). Otherwise, the
|
||||
padding_H = padding_W = padding. Default: padding = 0.
|
||||
dilation (int|tuple): The dilation size. If dilation is a tuple, it must
|
||||
contain two integers, (dilation_H, dilation_W). Otherwise, the
|
||||
dilation_H = dilation_W = dilation. Default: dilation = 1.
|
||||
groups (int): The groups number of the Conv2d Layer. According to grouped
|
||||
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
|
||||
the first half of the filters is only connected to the first half
|
||||
of the input channels, while the second half of the filters is only
|
||||
connected to the second half of the input channels. Default: groups=1.
|
||||
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
|
||||
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
|
||||
will create ParamAttr as param_attr. If the Initializer of the param_attr
|
||||
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
|
||||
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
|
||||
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
|
||||
If it is set to False, no bias will be added to the output units.
|
||||
If it is set to None or one attribute of ParamAttr, conv2d
|
||||
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
|
||||
is not set, the bias is initialized zero. Default: None.
|
||||
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
|
||||
library is installed. Default: True
|
||||
act (str): Activation type, if it is set to None, activation is not appended.
|
||||
Default: None
|
||||
|
||||
Raises:
|
||||
ValueError: If the shapes of input, filter_size, stride, padding and
|
||||
groups mismatch.
|
||||
|
||||
Examples:
|
||||
.. code-block:: python
|
||||
|
||||
from paddle.fluid.dygraph.base import to_variable
|
||||
import paddle.fluid as fluid
|
||||
from paddle.fluid.dygraph import Conv2D
|
||||
import numpy as np
|
||||
|
||||
data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
|
||||
with fluid.dygraph.guard():
|
||||
conv2d = Conv2D( "conv2d", 2, 3)
|
||||
data = to_variable( data )
|
||||
conv = conv2d( data )
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
name_scope,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=None,
|
||||
param_attr=None,
|
||||
bias_attr=None,
|
||||
use_cudnn=True,
|
||||
act=None,
|
||||
epsilon=1e-30,
|
||||
dtype="float32"):
|
||||
assert param_attr is not False, "param_attr should not be False here."
|
||||
super(Conv2D, self).__init__(name_scope, dtype)
|
||||
self._groups = groups
|
||||
self._stride = utils.convert_to_list(stride, 2, "stride")
|
||||
self._padding = utils.convert_to_list(padding, 2, "padding")
|
||||
self._dilation = utils.convert_to_list(dilation, 2, "dilation")
|
||||
self._act = act
|
||||
if not isinstance(use_cudnn, bool):
|
||||
raise ValueError("use_cudnn should be True or False")
|
||||
self._use_cudnn = use_cudnn
|
||||
self._filter_size = filter_size
|
||||
self._num_filters = num_filters
|
||||
self._param_attr = param_attr
|
||||
self._bias_attr = bias_attr
|
||||
self._epsilon = epsilon
|
||||
self._dtype = dtype
|
||||
# if (self._num_channels == self._groups and
|
||||
# num_filters % self._num_channels == 0 and not self._use_cudnn):
|
||||
# self._l_type = 'depthwise_conv2d'
|
||||
# else:
|
||||
# TODO(jiabin): recover the usage of depthwise_conv2d when it's
|
||||
# kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
|
||||
self._l_type = "conv2d"
|
||||
|
||||
def _build_once(self, input):
    """Create the filter (direction ``v`` and magnitude ``g``) and bias.

    The number of input channels is read from ``input.shape[1]``.

    Args:
        input: The first input fed to the layer; only its ``shape`` is
            read here.

    Raises:
        ValueError: If the number of input channels is not divisible by
            ``groups``.
    """
    self._num_channels = input.shape[1]

    # Each filter only sees the channels of its own group.
    channels_per_group = self._num_channels
    if self._groups is not None:
        if self._num_channels % self._groups != 0:
            raise ValueError("num_channels must be divisible by groups.")
        channels_per_group = self._num_channels // self._groups

    kernel_hw = utils.convert_to_list(self._filter_size, 2, "filter_size")
    weight_shape = [self._num_filters, int(channels_per_group)] + kernel_hw

    # Default initializer: Normal(0, sqrt(2 / (k_h * k_w * num_channels))).
    fan_in = kernel_hw[0] * kernel_hw[1] * self._num_channels
    init_std = (2.0 / fan_in)**0.5

    # weight_v: direction of the filter.
    self._filter_param_v = self.create_parameter(
        attr=self._param_attr,
        shape=weight_shape,
        dtype=self._dtype,
        default_initializer=Normal(0.0, init_std, 0))

    # weight_g: magnitude, initialized from the norm of the freshly
    # created v.  CAUTION: dim=0 is hard-coded and must agree with the
    # axis used when g is multiplied back in forward().
    g_init = _norm(self._filter_param_v.numpy(), dim=0)
    self._filter_param_g = self.create_parameter(
        attr=fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(g_init)),
        shape=g_init.shape,
        dtype=self._dtype,
        default_initializer=Normal(0.0, init_std, 0))

    if self._use_cudnn:
        # Persistable RAW variables, one per cuDNN algorithm cache name.
        for cache_name in ("kCUDNNFwdAlgoCache",
                           "kCUDNNBwdDataAlgoCache",
                           "kCUDNNBwdFilterAlgoCache"):
            self.create_variable(
                name=cache_name,
                persistable=True,
                type=core.VarDesc.VarType.RAW)

    self._bias_param = self.create_parameter(
        attr=self._bias_attr,
        shape=[self._num_filters],
        dtype=self._dtype,
        is_bias=True)
|
||||
|
||||
def forward(self, input):
    """Apply the weight-normalized convolution to ``input``.

    The effective filter is rebuilt on every call: ``v`` is flattened to
    two dimensions, normalized with the ``norm`` op along axis 1, reshaped
    back and multiplied by ``g``.  The convolution, the optional bias add
    and the activation follow.

    Args:
        input: Input variable of the convolution op.

    Returns:
        The result of ``append_activation`` applied to the convolution
        output (with bias added when a bias parameter exists).
    """
    helper = self._helper
    new_var = helper.create_variable_for_type_inference
    v_shape = self._filter_param_v.shape

    # 1) Flatten v to [shape[0], prod(shape[1:])].
    flat_v = new_var(self._dtype)
    flat_xshape = new_var(self._dtype)
    flat_shape = [
        v_shape[0],
        reduce(lambda x, y: x * y, v_shape[1:], 1),
    ]
    helper.append_op(
        type="reshape2",
        inputs={"X": self._filter_param_v},
        attrs={"shape": flat_shape},
        outputs={"Out": flat_v,
                 "XShape": flat_xshape})

    # 2) Normalize every row of the flattened filter.
    row_norm = new_var(self._dtype)
    flat_v_unit = new_var(self._dtype)
    helper.append_op(
        type="norm",
        inputs={"X": flat_v},
        outputs={"Out": flat_v_unit,
                 "Norm": row_norm},
        attrs={"axis": 1,
               "epsilon": self._epsilon})

    # 3) Restore the original filter shape.
    v_unit = new_var(self._dtype)
    unit_xshape = new_var(self._dtype)
    helper.append_op(
        type="reshape2",
        inputs={"X": flat_v_unit},
        attrs={"shape": v_shape},
        outputs={"Out": v_unit,
                 "XShape": unit_xshape})

    # 4) Scale the normalized direction by the magnitude g.
    weight = new_var(self._dtype)
    helper.append_op(
        type="elementwise_mul",
        inputs={"X": [v_unit],
                "Y": [self._filter_param_g]},
        outputs={"Out": [weight]},
        attrs={"axis": 0},  # CAUTION: hard-code
    )

    # 5) The convolution itself.
    pre_bias = new_var(dtype=self._dtype)
    helper.append_op(
        type=self._l_type,
        inputs={"Input": input,
                "Filter": weight},
        outputs={"Output": pre_bias},
        attrs={
            "strides": self._stride,
            "paddings": self._padding,
            "dilations": self._dilation,
            "groups": self._groups if self._groups else 1,
            "use_cudnn": self._use_cudnn,
            "use_mkldnn": False,
        })

    # 6) Optional bias, added along the channel axis.
    if self._bias_param is None:
        pre_act = pre_bias
    else:
        pre_act = new_var(dtype=self._dtype)
        helper.append_op(
            type="elementwise_add",
            inputs={"X": [pre_bias],
                    "Y": [self._bias_param]},
            outputs={"Out": [pre_act]},
            attrs={"axis": 1})

    # Currently, we don't support inplace in dygraph mode
    return helper.append_activation(pre_act, act=self._act)
|
||||
|
||||
|
||||
class Conv2DTranspose(dg.Layer):
    r"""
    **Convolution2D transpose layer (weight-normalized)**

    The convolution2D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCHW format. Where N is batch size, C is the number of channels,
    H is the height of the feature, and W is the width of the feature.
    Parameters(dilations, strides, paddings) are two elements. These two elements
    represent height and width, respectively. The details of convolution transpose
    layer, please refer to the following explanation and references
    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
    If bias attribution and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.

    The filter is stored as a direction :math:`V` and a magnitude :math:`g`;
    on every forward pass :math:`V` is normalized and rescaled by :math:`g`.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma ((Vg) \ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction, a tensor with the filter shape given below.
    * :math:`g`: Filter magnitude, multiplied onto the normalized :math:`V`
      along the first filter axis.
    * :math:`\ast`: Convolution operation.
    * :math:`b`: Bias value, a tensor with ``num_filters`` elements.
    * :math:`\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{in}, C_{out} / groups, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\
           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\
           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\
           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )

    Args:
        name_scope(str): The name of this class.
        num_filters(int): The number of the filter. It is as same as the output
            image channel.
        output_size(int|list|tuple|None): The output image size. If output size
            is a list or tuple, it must contain two integers,
            (image_H, image_W). None if use filter_size, padding, and stride
            to calculate output_size. If output_size and filter_size are
            specified at the same time, they should follow the formula above.
            Default: None.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square. None if use output size to
            calculate filter_size. Default: None.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int|None): The groups number of the Conv2d transpose layer.
            Inspired by grouped convolution in Alex Krizhevsky's Deep CNN
            paper, in which when group=2, the first half of the filters is
            only connected to the first half of the input channels, while the
            second half of the filters is only connected to the second half
            of the input channels. Default: None, which is treated as 1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        epsilon(float): The ``epsilon`` attribute passed to the ``norm`` op that
            normalizes the filter direction. Default: 1e-30.
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.
        dtype(str): Data type forwarded to the base layer. Default: "float32".

    Returns:
        Variable: The tensor variable storing the convolution transpose result.

    Raises:
        TypeError: If the input is not a Variable.
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch, if ``use_cudnn`` is not a bool, or if
                    ``output_size`` has an unsupported type.

    Examples:
       .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              # NCHW input: batch of 1, 3 channels, 32 x 32 feature map.
              data = numpy.random.random((1, 3, 32, 32)).astype('float32')
              conv2DTranspose = Conv2DTranspose(
                    'Conv2DTranspose', num_filters=2, filter_size=3)
              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))

    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 output_size=None,
                 filter_size=None,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 epsilon=1e-30,
                 act=None,
                 dtype="float32"):
        super(Conv2DTranspose, self).__init__(name_scope, dtype)
        assert (param_attr is not False
                ), "param_attr should not be False in conv2d_transpose."
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        # Keep the requested activation so forward() can apply it; it used
        # to be accepted by the constructor and then silently dropped.
        self._act = act
        self._groups = groups
        self._num_filters = num_filters
        self._use_cudnn = use_cudnn
        self._padding = padding
        self._stride = stride
        self._dilation = dilation
        self._filter_size = filter_size
        self._output_size = output_size
        self._op_type = "conv2d_transpose"
        self._epsilon = epsilon

    def _build_once(self, input):
        """Resolve the filter/output sizes and create the parameters.

        Args:
            input: The first input fed to the layer, expected in NCHW
                layout (channels are read from ``shape[1]``, height and
                width from ``shape[2]`` and ``shape[3]``).

        Raises:
            TypeError: If ``input`` is not a Variable.
            ValueError: If ``use_cudnn`` is not a bool, if neither
                ``filter_size`` nor ``output_size`` was given, or if
                ``output_size`` is not a list, tuple or int.
        """
        # Validate the type before touching input.shape so that a bad
        # input fails with the intended TypeError.
        if not isinstance(input, Variable):
            raise TypeError("Input of conv2d_transpose must be Variable")

        input_channel = input.shape[1]
        if (input_channel == self._groups and
                self._num_filters == input_channel and not self._use_cudnn):
            self._op_type = "depthwise_conv2d_transpose"

        self._padding = utils.convert_to_list(self._padding, 2, "padding")
        self._stride = utils.convert_to_list(self._stride, 2, "stride")
        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")

        if not isinstance(self._use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")

        if self._filter_size is None:
            # Derive the filter size by inverting the output-size formula
            # given in the class docstring.
            if self._output_size is None:
                raise ValueError(
                    "output_size must be set when filter_size is None")
            if isinstance(self._output_size, int):
                self._output_size = [self._output_size, self._output_size]

            h_in = input.shape[2]
            w_in = input.shape[3]

            filter_size_h = (self._output_size[0] -
                             (h_in - 1) * self._stride[0] +
                             2 * self._padding[0] - 1) // self._dilation[0] + 1
            filter_size_w = (self._output_size[1] -
                             (w_in - 1) * self._stride[1] +
                             2 * self._padding[1] - 1) // self._dilation[1] + 1
            self._filter_size = [filter_size_h, filter_size_w]
        else:
            self._filter_size = utils.convert_to_list(
                self._filter_size, 2, "conv2d_transpose.filter_size")

        if self._output_size is None:
            self._output_size = []
        elif isinstance(self._output_size, (list, tuple, int)):
            # Tuples are accepted as well, as the class docstring promises.
            self._output_size = utils.convert_to_list(self._output_size, 2,
                                                      "output_size")
        else:
            raise ValueError("output_size should be list, tuple or int")

        self._groups = 1 if self._groups is None else self._groups
        filter_shape = [
            input_channel,
            self._num_filters // self._groups,
        ] + self._filter_size

        # img filter v (direction)
        self._img_filter_v = self.create_parameter(
            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)

        # img filter g (magnitude), initialized from the norm of v.
        # CAUTION: dim=0 is hard-coded and must agree with the axis used
        # when g is multiplied back in forward().
        img_filter_magnitude = _norm(self._img_filter_v.numpy(), dim=0)
        self._img_filter_g = self.create_parameter(
            dtype=input.dtype,
            shape=img_filter_magnitude.shape,
            attr=fluid.ParamAttr(
                initializer=NumpyArrayInitializer(img_filter_magnitude)))

        self._img_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized transposed convolution to ``input``.

        The effective filter is rebuilt on every call: ``v`` is flattened
        to two dimensions, normalized with the ``norm`` op along axis 1,
        reshaped back and multiplied by ``g``.  The transposed convolution,
        the optional bias add and the activation follow.

        Args:
            input: Input variable of the transposed convolution op.

        Returns:
            The result of ``append_activation`` (using the ``act`` given
            to the constructor) applied to the op output, with bias added
            when a bias parameter exists.
        """
        # Flatten v to [shape[0], prod(shape[1:])].
        matrix = self._helper.create_variable_for_type_inference(self._dtype)
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        new_shape = [
            self._img_filter_v.shape[0],
            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
        ]

        self._helper.append_op(
            type="reshape2",
            inputs={"X": self._img_filter_v},
            attrs={"shape": new_shape},
            outputs={"Out": matrix,
                     "XShape": tmp})

        # Normalize every row of the flattened filter.
        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
        m_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="norm",
            inputs={"X": matrix},
            outputs={"Out": m_normalized,
                     "Norm": m_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        # Restore the original filter shape.
        v_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="reshape2",
            inputs={"X": m_normalized},
            attrs={"shape": self._img_filter_v.shape},
            outputs={"Out": v_normalized,
                     "XShape": tmp2})

        # Scale the normalized direction by the magnitude g.
        img_filter = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="elementwise_mul",
            inputs={"X": [v_normalized],
                    "Y": [self._img_filter_g]},
            outputs={"Out": [img_filter]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=input.dtype)
        self._helper.append_op(
            type=self._op_type,
            inputs={"Input": [input],
                    "Filter": [img_filter]},
            outputs={"Output": pre_bias},
            attrs={
                "output_size": self._output_size,
                "strides": self._stride,
                "paddings": self._padding,
                "dilations": self._dilation,
                "groups": self._groups,
                "use_cudnn": self._use_cudnn,
            })

        if self._img_bias is not None:
            pre_act = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._img_bias]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})
        else:
            pre_act = pre_bias

        # Pass the stored activation explicitly, matching Conv2D.forward.
        out = self._helper.append_activation(pre_act, act=self._act)
        return out
|
Loading…
Reference in New Issue