From fe4b4710363df083f005c9a828adbb3a611514e2 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 20 Nov 2019 19:38:00 +0800 Subject: [PATCH] use g2p in deepvoice3 --- deepvoice3/compute_timestamp_ratio.py | 2 +- deepvoice3/data.py | 2 +- deepvoice3/dry_run.py | 2 +- deepvoice3/eval_model.py | 2 +- deepvoice3/synthesis.py | 2 +- deepvoice3/train.py | 2 +- modules/__init__.py | 0 modules/conv.py | 222 ------- modules/frontend/README.md | 1 - modules/frontend/__init__.py | 33 - modules/frontend/en/__init__.py | 35 -- modules/frontend/es/__init__.py | 16 - modules/frontend/jp/__init__.py | 77 --- modules/frontend/ko/__init__.py | 17 - modules/frontend/text/__init__.py | 74 --- modules/frontend/text/cleaners.py | 104 ---- modules/frontend/text/cmudict.py | 67 -- modules/frontend/text/numbers.py | 71 --- modules/frontend/text/symbols.py | 18 - modules/loss.py | 158 ----- modules/modules.py | 458 -------------- modules/weight_norm.py | 863 -------------------------- 22 files changed, 6 insertions(+), 2220 deletions(-) delete mode 100644 modules/__init__.py delete mode 100644 modules/conv.py delete mode 100644 modules/frontend/README.md delete mode 100644 modules/frontend/__init__.py delete mode 100644 modules/frontend/en/__init__.py delete mode 100644 modules/frontend/es/__init__.py delete mode 100644 modules/frontend/jp/__init__.py delete mode 100644 modules/frontend/ko/__init__.py delete mode 100644 modules/frontend/text/__init__.py delete mode 100644 modules/frontend/text/cleaners.py delete mode 100644 modules/frontend/text/cmudict.py delete mode 100644 modules/frontend/text/numbers.py delete mode 100644 modules/frontend/text/symbols.py delete mode 100644 modules/loss.py delete mode 100644 modules/modules.py delete mode 100644 modules/weight_norm.py diff --git a/deepvoice3/compute_timestamp_ratio.py b/deepvoice3/compute_timestamp_ratio.py index 7fc306f..d737059 100644 --- a/deepvoice3/compute_timestamp_ratio.py +++ b/deepvoice3/compute_timestamp_ratio.py @@ -14,7 +14,7 @@ from hparams import hparams, hparams_debug_string from data.data import TextDataSource, MelSpecDataSource from nnmnkwii.datasets import FileSourceDataset from tqdm import trange -from modules import frontend +import g2p as frontend def build_parser(): diff --git a/deepvoice3/data.py b/deepvoice3/data.py index 197b1f3..6c9c85a 100644 --- a/deepvoice3/data.py +++ b/deepvoice3/data.py @@ -25,7 +25,7 @@ import random # import global hyper parameters from hparams import hparams -from modules import frontend +import g2p as frontend import builder _frontend = getattr(frontend, hparams.frontend) diff --git a/deepvoice3/dry_run.py b/deepvoice3/dry_run.py index 4c1366a..4428e40 100644 --- a/deepvoice3/dry_run.py +++ b/deepvoice3/dry_run.py @@ -17,7 +17,7 @@ from paddle import fluid import paddle.fluid.dygraph as dg from hparams import hparams, hparams_debug_string -from modules import frontend +import g2p as frontend from deepvoice3 import DeepVoiceTTS diff --git a/deepvoice3/eval_model.py b/deepvoice3/eval_model.py index 870fdd6..101d352 100644 --- a/deepvoice3/eval_model.py +++ b/deepvoice3/eval_model.py @@ -37,7 +37,7 @@ from tensorboardX import SummaryWriter # import global hyper parameters from hparams import hparams -from modules import frontend +import g2p as frontend _frontend = getattr(frontend, hparams.frontend) diff --git a/deepvoice3/synthesis.py b/deepvoice3/synthesis.py index e043403..e589c57 100644 --- a/deepvoice3/synthesis.py +++ b/deepvoice3/synthesis.py @@ -30,7 +30,7 @@ import paddle.fluid.dygraph as dg sys.path.append("../") import audio -from modules import frontend +import g2p as frontend import dry_run from hparams import hparams diff --git a/deepvoice3/train.py b/deepvoice3/train.py index f36ade4..0f6859c 100644 --- a/deepvoice3/train.py +++ b/deepvoice3/train.py @@ -32,7 +32,7 @@ from data import (TextDataSource, MelSpecDataSource, LinearSpecDataSource, PartialyRandomizedSimilarTimeLengthSampler, Dataset, make_loader, create_batch) -from modules import frontend +import g2p as frontend from builder import deepvoice3, WindowRange from dry_run import dry_run from train_model import train_model diff --git a/modules/__init__.py b/modules/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/modules/conv.py b/modules/conv.py deleted file mode 100644 index 34149be..0000000 --- a/modules/conv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import numpy as np - -import paddle -from paddle import fluid -import paddle.fluid.dygraph as dg - -from weight_norm import Conv2D, Conv2DTranspose - - -class Conv1D(dg.Layer): - """ - A convolution 1D block implemented with Conv2D. Form simplicity and - ensuring the output has the same length as the input, it does not allow - stride > 1. - """ - - def __init__(self, - name_scope, - in_cahnnels, - num_filters, - filter_size=3, - dilation=1, - groups=None, - causal=False, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - dtype="float32"): - super(Conv1D, self).__init__(name_scope, dtype=dtype) - - if causal: - padding = dilation * (filter_size - 1) - else: - padding = (dilation * (filter_size - 1)) // 2 - - self.in_channels = in_cahnnels - self.num_filters = num_filters - self.filter_size = filter_size - self.dilation = dilation - self.causal = causal - self.padding = padding - self.act = act - - self.conv = Conv2D( - self.full_name(), - num_filters=num_filters, - filter_size=(1, filter_size), - stride=(1, 1), - dilation=(1, dilation), - padding=(0, padding), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) - - def forward(self, x): - """ - Args: - x (Variable): Shape(B, C_in, 1, T), the input, where C_in means - input channels. - - Returns: - x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means - output channels (num_filters). - """ - x = self.conv(x) - if self.filter_size > 1: - if self.causal: - x = fluid.layers.slice( - x, axes=[3], starts=[0], ends=[-self.padding]) - elif self.filter_size % 2 == 0: - x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1]) - return x - - def start_new_sequence(self): - self.temp_weight = None - self.input_buffer = None - - def add_input(self, x): - """ - Adding input for a time step and compute an output for a time step. - - Args: - x (Variable): Shape(B, C_in, 1, T), the input, where C_in means - input channels, and T = 1. - - Returns: - out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out - means output channels (num_filters), and T = 1. - - """ - if self.temp_weight is None: - self.temp_weight = self._reshaped_weight() - - window_size = 1 + (self.filter_size - 1) * self.dilation - batch_size = x.shape[0] - in_channels = x.shape[1] - - if self.filter_size > 1: - if self.input_buffer is None: - self.input_buffer = fluid.layers.fill_constant( - [batch_size, in_channels, 1, window_size - 1], - dtype=x.dtype, - value=0.0) - else: - self.input_buffer = self.input_buffer[:, :, :, 1:] - self.input_buffer = fluid.layers.concat( - [self.input_buffer, x], axis=3) - x = self.input_buffer - if self.dilation > 1: - if not hasattr(self, "indices"): - self.indices = dg.to_variable( - np.arange(0, window_size, self.dilation)) - tmp = fluid.layers.transpose( - self.input_buffer, perm=[3, 1, 2, 0]) - tmp = fluid.layers.gather(tmp, index=self.indices) - tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0]) - x = tmp - inputs = fluid.layers.reshape( - x, shape=[batch_size, in_channels * 1 * self.filter_size]) - out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True) - out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1) - out = fluid.layers.reshape(out, out.shape + [1, 1]) - out = self._helper.append_activation(out, act=self.act) - return out - - def _reshaped_weight(self): - """ - Get the linearized weight of convolution filter, cause it is by nature - a matmul weight. And because the model uses weight norm, compute the - weight by weight_v * weight_g to make it faster. - - Returns: - weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size) - """ - shape = self.conv._filter_param_v.shape - matrix_shape = [shape[0], np.prod(shape[1:])] - weight_matrix = fluid.layers.reshape( - self.conv._filter_param_v, shape=matrix_shape) - weight_matrix = fluid.layers.elementwise_mul( - fluid.layers.l2_normalize( - weight_matrix, axis=1), - self.conv._filter_param_g, - axis=0) - return weight_matrix - - -class Conv1DTranspose(dg.Layer): - """ - A convolutional transpose 1D block implemented with convolutional transpose - 2D. It does not ensure that the output is exactly expanded stride times in - time dimension. - """ - - def __init__(self, - name_scope, - in_channels, - num_filters, - filter_size, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - dtype="float32"): - super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype) - - self.in_channels = in_channels - self.num_filters = num_filters - self.filter_size = filter_size - self.padding = padding - self.stride = stride - self.dilation = dilation - self.groups = groups - - self.conv_transpose = Conv2DTranspose( - self.full_name(), - num_filters, - filter_size=(1, filter_size), - padding=(0, padding), - stride=(1, stride), - dilation=(1, dilation), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) - - def forward(self, x): - """ - Argss: - x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input - channels and T_in means the number of time steps of input. - - Returns: - out (Variable): shape(B, C_out, 1, T_out), where C_out means the - output channels and T_out means the number of time steps of - input. - """ - return self.conv_transpose(x) diff --git a/modules/frontend/README.md b/modules/frontend/README.md deleted file mode 100644 index af4513e..0000000 --- a/modules/frontend/README.md +++ /dev/null @@ -1 +0,0 @@ -This package is adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/deepvoice3_pytorch/frontend, Copyright (c) 2017: Ryuichi Yamamoto, whose license applies. diff --git a/modules/frontend/__init__.py b/modules/frontend/__init__.py deleted file mode 100644 index f5f6c0a..0000000 --- a/modules/frontend/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -"""Text processing frontend - -All frontend module should have the following functions: - -- text_to_sequence(text, p) -- sequence_to_text(sequence) - -and the property: - -- n_vocab - -""" -from . import en - -# optinoal Japanese frontend -try: - from . import jp -except ImportError: - jp = None - -try: - from . import ko -except ImportError: - ko = None - -# if you are going to use the frontend, you need to modify _characters in -# symbol.py: -# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ' -try: - from . import es -except ImportError: - es = None diff --git a/modules/frontend/en/__init__.py b/modules/frontend/en/__init__.py deleted file mode 100644 index 58cf2f6..0000000 --- a/modules/frontend/en/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 -from modules.frontend.text.symbols import symbols - -import nltk -from random import random - -n_vocab = len(symbols) - -_arpabet = nltk.corpus.cmudict.dict() - - -def _maybe_get_arpabet(word, p): - try: - phonemes = _arpabet[word][0] - phonemes = " ".join(phonemes) - except KeyError: - return word - - return '{%s}' % phonemes if random() < p else word - - -def mix_pronunciation(text, p): - text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' ')) - return text - - -def text_to_sequence(text, p=0.0): - if p >= 0: - text = mix_pronunciation(text, p) - from modules.frontend.text import text_to_sequence - text = text_to_sequence(text, ["english_cleaners"]) - return text - - -from modules.frontend.text import sequence_to_text diff --git a/modules/frontend/es/__init__.py b/modules/frontend/es/__init__.py deleted file mode 100644 index 24323e5..0000000 --- a/modules/frontend/es/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf-8 -from deepvoice3_paddle.frontend.text.symbols import symbols - -import nltk -from random import random - -n_vocab = len(symbols) - - -def text_to_sequence(text, p=0.0): - from deepvoice3_paddle.frontend.text import text_to_sequence - text = text_to_sequence(text, ["basic_cleaners"]) - return text - - -from deepvoice3_paddle.frontend.text import sequence_to_text diff --git a/modules/frontend/jp/__init__.py b/modules/frontend/jp/__init__.py deleted file mode 100644 index 36c7fd8..0000000 --- a/modules/frontend/jp/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 - -import MeCab -import jaconv -from random import random - -n_vocab = 0xffff - -_eos = 1 -_pad = 0 -_tagger = None - - -def _yomi(mecab_result): - tokens = [] - yomis = [] - for line in mecab_result.split("\n")[:-1]: - s = line.split("\t") - if len(s) == 1: - break - token, rest = s - rest = rest.split(",") - tokens.append(token) - yomi = rest[7] if len(rest) > 7 else None - yomi = None if yomi == "*" else yomi - yomis.append(yomi) - - return tokens, yomis - - -def _mix_pronunciation(tokens, yomis, p): - return "".join(yomis[idx] - if yomis[idx] is not None and random() < p else tokens[idx] - for idx in range(len(tokens))) - - -def mix_pronunciation(text, p): - global _tagger - if _tagger is None: - _tagger = MeCab.Tagger("") - tokens, yomis = _yomi(_tagger.parse(text)) - return _mix_pronunciation(tokens, yomis, p) - - -def add_punctuation(text): - last = text[-1] - if last not in [".", ",", "、", "。", "!", "?", "!", "?"]: - text = text + "。" - return text - - -def normalize_delimitor(text): - text = text.replace(",", "、") - text = text.replace(".", "。") - text = text.replace(",", "、") - text = text.replace(".", "。") - return text - - -def text_to_sequence(text, p=0.0): - for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]: - text = text.replace(c, "") - text = text.replace("!", "!") - text = text.replace("?", "?") - - text = normalize_delimitor(text) - text = jaconv.normalize(text) - if p > 0: - text = mix_pronunciation(text, p) - text = jaconv.hira2kata(text) - text = add_punctuation(text) - - return [ord(c) for c in text] + [_eos] # EOS - - -def sequence_to_text(seq): - return "".join(chr(n) for n in seq) diff --git a/modules/frontend/ko/__init__.py b/modules/frontend/ko/__init__.py deleted file mode 100644 index ccb8b5f..0000000 --- a/modules/frontend/ko/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# coding: utf-8 - -from random import random - -n_vocab = 0xffff - -_eos = 1 -_pad = 0 -_tagger = None - - -def text_to_sequence(text, p=0.0): - return [ord(c) for c in text] + [_eos] # EOS - - -def sequence_to_text(seq): - return "".join(chr(n) for n in seq) diff --git a/modules/frontend/text/__init__.py b/modules/frontend/text/__init__.py deleted file mode 100644 index 26244ce..0000000 --- a/modules/frontend/text/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -import re -from . import cleaners -from .symbols import symbols - -# Mappings from symbol to numeric ID and vice versa: -_symbol_to_id = {s: i for i, s in enumerate(symbols)} -_id_to_symbol = {i: s for i, s in enumerate(symbols)} - -# Regular expression matching text enclosed in curly braces: -_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') - - -def text_to_sequence(text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." - - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [] - - # Check for curly braces and treat their contents as ARPAbet: - while len(text): - m = _curly_re.match(text) - if not m: - sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) - break - sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) - sequence += _arpabet_to_sequence(m.group(2)) - text = m.group(3) - - # Append EOS token - sequence.append(_symbol_to_id['~']) - return sequence - - -def sequence_to_text(sequence): - '''Converts a sequence of IDs back to a string''' - result = '' - for symbol_id in sequence: - if symbol_id in _id_to_symbol: - s = _id_to_symbol[symbol_id] - # Enclose ARPAbet back in curly braces: - if len(s) > 1 and s[0] == '@': - s = '{%s}' % s[1:] - result += s - return result.replace('}{', ' ') - - -def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text - - -def _symbols_to_sequence(symbols): - return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] - - -def _arpabet_to_sequence(text): - return _symbols_to_sequence(['@' + s for s in text.split()]) - - -def _should_keep_symbol(s): - return s in _symbol_to_id and s is not '_' and s is not '~' diff --git a/modules/frontend/text/cleaners.py b/modules/frontend/text/cleaners.py deleted file mode 100644 index e942264..0000000 --- a/modules/frontend/text/cleaners.py +++ /dev/null @@ -1,104 +0,0 @@ -''' -Cleaners are transformations that run over the input text at both training and -eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as -the "cleaners" hyperparameter. Some cleaners are English-specific. You'll -typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated - to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you - should also update the symbols in symbols.py to match your data). -''' - -import re -from unidecode import unidecode -from .numbers import normalize_numbers - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), - ]] - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def add_punctuation(text): - if len(text) == 0: - return text - if text[-1] not in '!,.:;?': - text = text + '.' # without this decoder is confused when to output EOS - return text - - -def basic_cleaners(text): - ''' - Basic pipeline that lowercases and collapses whitespace without - transliteration. - ''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - ''' - Pipeline for English text, including number and abbreviation expansion. - ''' - text = convert_to_ascii(text) - text = add_punctuation(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = collapse_whitespace(text) - return text diff --git a/modules/frontend/text/cmudict.py b/modules/frontend/text/cmudict.py deleted file mode 100644 index 304592b..0000000 --- a/modules/frontend/text/cmudict.py +++ /dev/null @@ -1,67 +0,0 @@ -import re - -valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', - 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', - 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', - 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', - 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', - 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', - 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', - 'Z', 'ZH' -] - -_valid_symbol_set = set(valid_symbols) - - -class CMUDict: - ''' - Thin wrapper around CMUDict data. - http://www.speech.cs.cmu.edu/cgi-bin/cmudict - ''' - - def __init__(self, file_or_path, keep_ambiguous=True): - if isinstance(file_or_path, str): - with open(file_or_path, encoding='latin-1') as f: - entries = _parse_cmudict(f) - else: - entries = _parse_cmudict(file_or_path) - if not keep_ambiguous: - entries = { - word: pron - for word, pron in entries.items() if len(pron) == 1 - } - self._entries = entries - - def __len__(self): - return len(self._entries) - - def lookup(self, word): - '''Returns list of ARPAbet pronunciations of the given word.''' - return self._entries.get(word.upper()) - - -_alt_re = re.compile(r'\([0-9]+\)') - - -def _parse_cmudict(file): - cmudict = {} - for line in file: - if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): - parts = line.split(' ') - word = re.sub(_alt_re, '', parts[0]) - pronunciation = _get_pronunciation(parts[1]) - if pronunciation: - if word in cmudict: - cmudict[word].append(pronunciation) - else: - cmudict[word] = [pronunciation] - return cmudict - - -def _get_pronunciation(s): - parts = s.strip().split(' ') - for part in parts: - if part not in _valid_symbol_set: - return None - return ' '.join(parts) diff --git a/modules/frontend/text/numbers.py b/modules/frontend/text/numbers.py deleted file mode 100644 index 24b5817..0000000 --- a/modules/frontend/text/numbers.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- - -import inflect -import re - -_inflect = inflect.engine() -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') -_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') - - -def _remove_commas(m): - return m.group(1).replace(',', '') - - -def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') - - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' - - -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) - - -def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - else: - return _inflect.number_to_words( - num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') - - -def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text diff --git a/modules/frontend/text/symbols.py b/modules/frontend/text/symbols.py deleted file mode 100644 index c6fc28b..0000000 --- a/modules/frontend/text/symbols.py +++ /dev/null @@ -1,18 +0,0 @@ -''' -Defines the set of symbols used in text input to the model. - -The default is a set of ASCII characters that works well for English or text -that has been run through Unidecode. For other data, you can modify _characters. -See TRAINING_DATA.md for details. -''' -from .cmudict import valid_symbols - -_pad = '_' -_eos = '~' -_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' - -# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in valid_symbols] - -# Export all symbols: -symbols = [_pad, _eos] + list(_characters) + _arpabet diff --git a/modules/loss.py b/modules/loss.py deleted file mode 100644 index 96bcd3b..0000000 --- a/modules/loss.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -from numba import jit - -from paddle import fluid -import paddle.fluid.dygraph as dg - - -def masked_mean(inputs, mask): - """ - Args: - inputs (Variable): Shape(B, C, 1, T), the input, where B means - batch size, C means channels of input, T means timesteps of - the input. - mask (Variable): Shape(B, T), a mask. - Returns: - loss (Variable): Shape(1, ), masked mean. - """ - channels = inputs.shape[1] - reshaped_mask = fluid.layers.reshape( - mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]]) - expanded_mask = fluid.layers.expand( - reshaped_mask, expand_times=[1, channels, 1, 1]) - expanded_mask.stop_gradient = True - - valid_cnt = fluid.layers.reduce_sum(expanded_mask) - valid_cnt.stop_gradient = True - - masked_inputs = inputs * expanded_mask - loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt - return loss - - -@jit(nopython=True) -def guided_attention(N, max_N, T, max_T, g): - W = np.zeros((max_N, max_T), dtype=np.float32) - for n in range(N): - for t in range(T): - W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g)) - return W - - -def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2): - B = len(input_lengths) - max_input_len = input_lengths.max() - W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32) - for b in range(B): - W[b] = guided_attention(input_lengths[b], max_input_len, - target_lengths[b], max_target_len, g).T - return W - - -class TTSLoss(object): - def __init__(self, - masked_weight=0.0, - priority_weight=0.0, - binary_divergence_weight=0.0, - guided_attention_sigma=0.2): - self.masked_weight = masked_weight - self.priority_weight = priority_weight - self.binary_divergence_weight = binary_divergence_weight - self.guided_attention_sigma = guided_attention_sigma - - def l1_loss(self, prediction, target, mask, priority_bin=None): - abs_diff = fluid.layers.abs(prediction - target) - - # basic mask-weighted l1 loss - w = self.masked_weight - if w > 0 and mask is not None: - base_l1_loss = w * masked_mean(abs_diff, mask) + ( - 1 - w) * fluid.layers.reduce_mean(abs_diff) - else: - base_l1_loss = fluid.layers.reduce_mean(abs_diff) - - if self.priority_weight > 0 and priority_bin is not None: - # mask-weighted priority channels' l1-loss - priority_abs_diff = fluid.layers.slice( - abs_diff, axes=[1], starts=[0], ends=[priority_bin]) - if w > 0 and mask is not None: - priority_loss = w * masked_mean(priority_abs_diff, mask) + ( - 1 - w) * fluid.layers.reduce_mean(priority_abs_diff) - else: - priority_loss = fluid.layers.reduce_mean(priority_abs_diff) - - # priority weighted sum - p = self.priority_weight - loss = p * priority_loss + (1 - p) * base_l1_loss - else: - loss = base_l1_loss - return loss - - def binary_divergence(self, prediction, target, mask): - flattened_prediction = fluid.layers.reshape(prediction, [-1, 1]) - flattened_target = fluid.layers.reshape(target, [-1, 1]) - flattened_loss = fluid.layers.log_loss( - flattened_prediction, flattened_target, epsilon=1e-8) - bin_div = fluid.layers.reshape(flattened_loss, prediction.shape) - - w = self.masked_weight - if w > 0 and mask is not None: - loss = w * masked_mean(bin_div, mask) + ( - 1 - w) * fluid.layers.reduce_mean(bin_div) - else: - loss = fluid.layers.reduce_mean(bin_div) - return loss - - @staticmethod - def done_loss(done_hat, done): - flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1]) - flat_done = fluid.layers.reshape(done, [-1, 1]) - loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8) - loss = fluid.layers.reduce_mean(loss) - return loss - - def attention_loss(self, predicted_attention, input_lengths, - target_lengths): - """ - Given valid encoder_lengths and decoder_lengths, compute a diagonal - guide, and compute loss from the predicted attention and the guide. - - Args: - predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the - alignment tensor, where B means batch size, T_dec means number - of time steps of the decoder, T_enc means the number of time - steps of the encoder, * means other possible dimensions. - input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths - (time steps) of encoder outputs. - target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64, - valid lengths (time steps) of decoder outputs. - - Returns: - loss (Variable): Shape(1, ) attention loss. - """ - n_attention, batch_size, max_target_len, max_input_len = ( - predicted_attention.shape) - soft_mask = guided_attentions(input_lengths, target_lengths, - max_target_len, - self.guided_attention_sigma) - soft_mask_ = dg.to_variable(soft_mask) - loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_) - return loss diff --git a/modules/modules.py b/modules/modules.py deleted file mode 100644 index 3ae95d7..0000000 --- a/modules/modules.py +++ /dev/null @@ -1,458 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle import fluid -import paddle.fluid.dygraph as dg - -import numpy as np - -import conv -import weight_norm as weight_norm - - -def FC(name_scope, - in_features, - size, - num_flatten_dims=1, - dropout=0.0, - epsilon=1e-30, - act=None, - is_test=False, - dtype="float32"): - """ - A special Linear Layer, when it is used with dropout, the weight is - initialized as normal(0, std=np.sqrt((1-dropout) / in_features)) - """ - - # stds - if isinstance(in_features, int): - in_features = [in_features] - stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features] - weight_inits = [ - fluid.initializer.NormalInitializer(scale=std) for std in stds - ] - bias_init = fluid.initializer.ConstantInitializer(0.0) - - # param attrs - weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits] - bias_attr = fluid.ParamAttr(initializer=bias_init) - - layer = weight_norm.FC(name_scope, - size, - num_flatten_dims=num_flatten_dims, - param_attr=weight_attrs, - bias_attr=bias_attr, - act=act, - dtype=dtype) - return layer - - -def Conv1D(name_scope, - in_channels, - num_filters, - filter_size=3, - dilation=1, - groups=None, - causal=False, - std_mul=1.0, - dropout=0.0, - use_cudnn=True, - act=None, - dtype="float32"): - """ - A special Conv1D Layer, when it is used with dropout, the weight is - initialized as - normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features))) - """ - # std - std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels)) - weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std) - bias_init = fluid.initializer.ConstantInitializer(0.0) - - # param attrs - weight_attr = fluid.ParamAttr(initializer=weight_init) - bias_attr = fluid.ParamAttr(initializer=bias_init) - - layer = conv.Conv1D( - name_scope, - in_channels, - num_filters, - filter_size, - dilation, - groups=groups, - causal=causal, - param_attr=weight_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) - return layer - - -def Embedding(name_scope, - num_embeddings, - embed_dim, - is_sparse=False, - is_distributed=False, - padding_idx=None, - std=0.01, - dtype="float32"): - # param attrs - weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal( - scale=std)) - layer = dg.Embedding( - name_scope, (num_embeddings, embed_dim), - padding_idx=padding_idx, - param_attr=weight_attr, - dtype=dtype) - return layer - - -class Conv1DGLU(dg.Layer): - """ - A Convolution 1D block with GLU activation. It also applys dropout for the - input x. It fuses speaker embeddings through a FC activated by softsign. It - has residual connection from the input x, and scale the output by - np.sqrt(0.5). - """ - - def __init__(self, - name_scope, - n_speakers, - speaker_dim, - in_channels, - num_filters, - filter_size, - dilation, - std_mul=4.0, - dropout=0.0, - causal=False, - residual=True, - dtype="float32"): - super(Conv1DGLU, self).__init__(name_scope, dtype=dtype) - - # conv spec - self.in_channels = in_channels - self.n_speakers = n_speakers - self.speaker_dim = speaker_dim - self.num_filters = num_filters - self.filter_size = filter_size - self.dilation = dilation - self.causal = causal - self.residual = residual - - # weight init and dropout - self.std_mul = std_mul - self.dropout = dropout - - if residual: - assert ( - in_channels == num_filters - ), "this block uses residual connection"\ - "the input_channes should equals num_filters" - - self.conv = Conv1D( - self.full_name(), - in_channels, - 2 * num_filters, - filter_size, - dilation, - causal=causal, - std_mul=std_mul, - dropout=dropout, - dtype=dtype) - - if n_speakers > 1: - assert (speaker_dim is not None - ), "speaker embed should not be null in multi-speaker case" - self.fc = Conv1D( - self.full_name(), - speaker_dim, - num_filters, - filter_size=1, - dilation=1, - causal=False, - act="softsign", - dtype=dtype) - - def forward(self, x, speaker_embed_bc1t=None): - """ - Args: - x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU - layer, where B means batch_size, C_in means the input channels - T means input time steps. - speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded - speaker embed, where C_sp means speaker embedding size. Note - that when using residual connection, the Conv1DGLU does not - change the number of channels, so out channels equals input - channels. - - Returns: - x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where - C_out means the output channels of Conv1DGLU. - """ - - residual = x - x = fluid.layers.dropout( - x, self.dropout, dropout_implementation="upscale_in_train") - x = self.conv(x) - - content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) - - if speaker_embed_bc1t is not None: - sp = self.fc(speaker_embed_bc1t) - content = content + sp - - # glu - x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) - - if self.residual: - x = fluid.layers.scale(x + residual, np.sqrt(0.5)) - return x - - def add_input(self, x, speaker_embed_bc11=None): - """ - Inputs: - x: shape(B, num_filters, 1, time_steps) - speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps) - - Outputs: - out: shape(B, num_filters, 1, time_steps), where time_steps = 1 - """ - - residual = x - - # add step input and produce step output - x = fluid.layers.dropout( - x, self.dropout, dropout_implementation="upscale_in_train") - x = self.conv.add_input(x) - - content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) - - if speaker_embed_bc11 is not None: - sp = self.fc(speaker_embed_bc11) - content = content + sp - - x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) - - if self.residual: - x = fluid.layers.scale(x + residual, np.sqrt(0.5)) - return x - - -def Conv1DTranspose(name_scope, - in_channels, - num_filters, - filter_size, - padding=0, - stride=1, - dilation=1, - groups=None, - std_mul=1.0, - dropout=0.0, - use_cudnn=True, - act=None, - dtype="float32"): - std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) - weight_init = fluid.initializer.NormalInitializer(scale=std) - weight_attr = fluid.ParamAttr(initializer=weight_init) - bias_init = fluid.initializer.ConstantInitializer(0.0) - bias_attr = fluid.ParamAttr(initializer=bias_init) - layer = conv.Conv1DTranspose( - name_scope, - in_channels, - num_filters, - filter_size, - padding=padding, - stride=stride, - dilation=dilation, - groups=groups, - param_attr=weight_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) - return layer - - -def compute_position_embedding(rad): - # rad is a transposed radius, shape(embed_dim, n_vocab) - embed_dim, n_vocab = rad.shape - - even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32")) - odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32")) - - even_rads = fluid.layers.gather(rad, even_dims) - odd_rads = fluid.layers.gather(rad, odd_dims) - - sines = fluid.layers.sin(even_rads) - cosines = fluid.layers.cos(odd_rads) - - temp = fluid.layers.scatter(rad, even_dims, sines) - out = fluid.layers.scatter(temp, odd_dims, cosines) - out = fluid.layers.transpose(out, perm=[1, 0]) - return out - - -def position_encoding_init(n_position, - d_pos_vec, - position_rate=1.0, - sinusoidal=True): - """ Init the sinusoid position encoding table """ - - # keep idx 0 for padding token position encoding zero vector - position_enc = np.array([[ - position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) - for i in range(d_pos_vec) - ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) - - if sinusoidal: - position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i - position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 - - return position_enc - - -class PositionEmbedding(dg.Layer): - def __init__(self, - name_scope, - n_position, - d_pos_vec, - position_rate=1.0, - is_sparse=False, - is_distributed=False, - param_attr=None, - max_norm=None, - padding_idx=None, - dtype="float32"): - super(PositionEmbedding, self).__init__(name_scope, dtype=dtype) - self.embed = dg.Embedding( - self.full_name(), - size=(n_position, d_pos_vec), - is_sparse=is_sparse, - is_distributed=is_distributed, - padding_idx=None, - param_attr=param_attr, - dtype=dtype) - self.set_weight( - position_encoding_init( - n_position, - d_pos_vec, - position_rate=position_rate, - sinusoidal=False).astype(dtype)) - - self._is_sparse = is_sparse - self._is_distributed = is_distributed - self._remote_prefetch = self._is_sparse and (not self._is_distributed) - if self._remote_prefetch: - assert self._is_sparse is True and self._is_distributed is False - - self._padding_idx = (-1 if padding_idx is None else padding_idx if - padding_idx >= 0 else (n_position + padding_idx)) - self._position_rate = position_rate - self._max_norm = max_norm - self._dtype = dtype - - def set_weight(self, array): - assert self.embed._w.shape == list(array.shape), "shape does not match" - self.embed._w._ivar.value().get_tensor().set( - array, fluid.framework._current_expected_place()) - - def forward(self, indices, speaker_position_rate=None): - """ - Args: - indices (Variable): Shape (B, T, 1), dtype: int64, position - indices, where B means the batch size, T means the time steps. - speaker_position_rate (Variable | float, optional), position - rate. It can be a float point number or a Variable with - shape (1,), then this speaker_position_rate is used for every - example. It can also be a Variable with shape (B, 1), which - contains a speaker position rate for each speaker. - Returns: - out (Variable): Shape(B, C_pos), position embedding, where C_pos - means position embedding size. - """ - rad = fluid.layers.transpose(self.embed._w, perm=[1, 0]) - batch_size = indices.shape[0] - - if speaker_position_rate is None: - weight = compute_position_embedding(rad) - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="lookup_table", - inputs={"Ids": indices, - "W": weight}, - outputs={"Out": out}, - attrs={ - "is_sparse": self._is_sparse, - "is_distributed": self._is_distributed, - "remote_prefetch": self._remote_prefetch, - "padding_idx": - self._padding_idx, # special value for lookup table op - }) - return out - - elif (np.isscalar(speaker_position_rate) or - isinstance(speaker_position_rate, fluid.framework.Variable) and - speaker_position_rate.shape == [1, 1]): - # # make a weight - # scale the weight (the operand for sin & cos) - if np.isscalar(speaker_position_rate): - scaled_rad = fluid.layers.scale(rad, speaker_position_rate) - else: - scaled_rad = fluid.layers.elementwise_mul( - rad, speaker_position_rate[0]) - weight = compute_position_embedding(scaled_rad) - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="lookup_table", - inputs={"Ids": indices, - "W": weight}, - outputs={"Out": out}, - attrs={ - "is_sparse": self._is_sparse, - "is_distributed": self._is_distributed, - "remote_prefetch": self._remote_prefetch, - "padding_idx": - self._padding_idx, # special value for lookup table op - }) - return out - - elif np.prod(speaker_position_rate.shape) > 1: - assert speaker_position_rate.shape == [batch_size, 1] - outputs = [] - for i in range(batch_size): - rate = speaker_position_rate[i] # rate has shape [1] - scaled_rad = fluid.layers.elementwise_mul(rad, rate) - weight = compute_position_embedding(scaled_rad) - out = self._helper.create_variable_for_type_inference( - self._dtype) - sequence = indices[i] - self._helper.append_op( - type="lookup_table", - inputs={"Ids": sequence, - "W": weight}, - outputs={"Out": out}, - attrs={ - "is_sparse": self._is_sparse, - "is_distributed": self._is_distributed, - "remote_prefetch": self._remote_prefetch, - "padding_idx": -1, - }) - outputs.append(out) - out = fluid.layers.stack(outputs) - return out - else: - raise Exception("Then you can just use position rate at init") diff --git a/modules/weight_norm.py b/modules/weight_norm.py deleted file mode 100644 index cbb0d03..0000000 --- a/modules/weight_norm.py +++ /dev/null @@ -1,863 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import numpy as np -from six.moves import reduce - -from copy import deepcopy - -import paddle -from paddle import fluid -import paddle.fluid.dygraph as dg -from paddle.fluid import core -from paddle.fluid.layers import utils -from paddle.fluid.framework import Variable -from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer - - -def _norm(p, dim): - """Computes the norm over all dimensions except dim. - It differs from pytorch implementation that it does not keep dim. - This difference is related with the broadcast mechanism in paddle. - Read elementeise_mul for more. - """ - - if dim is None: - return np.linalg.norm(p, ord=2, axis=None) - elif dim == 0: - p = np.reshape(p, newshape=(p.shape[0], -1)) - return np.linalg.norm(p, ord=2, axis=1) - elif dim == p.ndim - 1: - p = np.reshape(p, newshape=(-1, p.shape[-1])) - return np.linalg.norm(p, ord=2, axis=0) - else: - perm = list(range(p.ndim)) - perm[0] = dim - perm[dim] = 0 - return _norm(np.transpose(p, axes=perm)) - - -class FC(dg.Layer): - """ - **Fully Connected Layer** - - This function creates a fully connected layer in the network. It can take - one or multiple tensors as its inputs(input can be a list of Variable, see - Args in detail). It creates a pair of variables called (magnitude(g), - direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected - weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor - with its corresponding weight to produce an output Tensor with shape [M, `size`], - where M is batch size. If multiple input tensors are given, the results of - multiple output tensors with shape [M, `size`] will be summed up. If bias_attr - is not None, a bias variable will be created and added to the output. - Finally, if activation is not None, it will be applied to the output as well. - - When the input is single tensor: - - .. math:: - - Out = Act({X(normalize(V)g) + b}) - - When the input are multiple tensors: - - .. math:: - - Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b}) - - In the above equation: - - * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable. - * :math:`X_i`: The i-th input tensor. - * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor. - * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor. - * :math:`b`: The bias parameter created by this layer (if needed). - * :math:`Act`: The activation function. - * :math:`Out`: The output tensor. - - See below for an example. - - .. code-block:: text - - Given: - data_1.data = [[[0.1, 0.2], - [0.3, 0.4]]] - data_1.shape = (1, 2, 2) # 1 is batch_size - - data_2 = [[[0.1, 0.2, 0.3]]] - data_2.shape = (1, 1, 3) - - out = fluid.layers.fc(input=[data_1, data_2], size=2) - - Then: - out.data = [[0.18669507, 0.1893476]] - out.shape = (1, 2) - - Args: - name_scope(str): The name of this class. - size(int): The number of output units in this layer. - num_flatten_dims (int): The fc layer can accept an input tensor with more than - two dimensions. If this happens, the multidimensional tensor will first be flattened - into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input - tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) - dimensions will be flatten to form the first dimension of the final matrix (height of - the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to - form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. - Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1 - param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable - parameters/weights of this layer. - bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. Default: None. - act (str|None): Activation to be applied to the output of this layer. - is_test(bool): A flag indicating whether execution is in test phase. Default: False - dtype(str): Dtype used for weight - - Raises: - ValueError: If rank of the input tensor is less than 2. - - Examples: - .. code-block:: python - - from paddle.fluid.dygraph.base import to_variable - import paddle.fluid as fluid - from paddle.fluid.dygraph import FC - import numpy as np - - data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32') - with fluid.dygraph.guard(): - fc = FC( "fc", 64, num_flatten_dims=2) - data = to_variable( data ) - conv = fc( data ) - - """ - - def __init__(self, - name_scope, - size, - num_flatten_dims=1, - epsilon=1e-30, - param_attr=None, - bias_attr=None, - act=None, - is_test=False, - dtype="float32"): - super(FC, self).__init__(name_scope, dtype) - - self._size = size - self._num_flatten_dims = num_flatten_dims - self._epsilon = epsilon - self._dtype = dtype - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self.__g = list() - self.__v = list() - - @property - def _v(self, i=0): - return self.__v[i] - - @property - def _g(self, i=0): - return self.__g[i] - - @_v.setter - def _v(self, value, i=0): - assert isinstance(value, Parameter) - self.__v[i] = value - - @_g.setter - def _g(self, value, i=0): - assert isinstance(value, Parameter) - self.__g[i] = value - - def _build_once(self, input): - i = 0 - for inp, param in self._helper.iter_inputs_and_params(input, - self._param_attr): - input_shape = inp.shape - - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], - 1) - ] + [self._size] - self.__v.append( - self.add_parameter( - "_v%d" % i, - self.create_parameter( - attr=param, - shape=param_shape, - dtype=self._dtype, - is_bias=False))) - - magnitude_shape = param_shape[1:] - magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0) - - self.__g.append( - self.add_parameter( - "_g%d" % i, - self.create_parameter( - attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - magnitude_value)), - shape=magnitude_shape, - dtype=self._dtype, - is_bias=False))) - i += 1 - - size = list([self._size]) - self._b = self.create_parameter( - attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True) - - def forward(self, input): - mul_results = list() - i = 0 - for inp, param in self._helper.iter_inputs_and_params(input, - self._param_attr): - v_norm = self._helper.create_variable_for_type_inference( - self._dtype) - v_normalized = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="norm", - inputs={"X": self.__v[i]}, - outputs={"Out": v_normalized, - "Norm": v_norm}, - attrs={"axis": 0, - "epsilon": self._epsilon}) - weight = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="elementwise_mul", - inputs={"X": [v_normalized], - "Y": [self.__g[i]]}, - outputs={"Out": [weight]}, - attrs={"axis": 1}) - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": inp, - "Y": weight}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) - i += 1 - mul_results.append(tmp) - - if len(mul_results) == 1: - pre_bias = mul_results[0] - else: - pre_bias = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="sum", - inputs={"X": mul_results}, - outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}) - - if self._b: - pre_activation = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - self._helper.append_op( - type="elementwise_add", - inputs={"X": [pre_bias], - "Y": [self._b]}, - outputs={"Out": [pre_activation]}, - attrs={"axis": self._num_flatten_dims}) - else: - pre_activation = pre_bias - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(pre_activation, act=self._act) - - -class Conv2D(dg.Layer): - """ - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input and - Output are in NCHW format, where N is batch size, C is the number of - channels, H is the height of the feature, and W is the width of the feature. - Filter is in MCHW format, where M is the number of output image channels, - C is the number of input image channels, H is the height of the filter, - and W is the width of the filter. If the groups is greater than 1, - C will equal the number of input image channels divided by the groups. - Please refer to UFLDL's `convolution - ` - for more detials. - If bias attribution and activation type are provided, bias is added to the - output of the convolution, and the corresponding activation function is - applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma ((Vg) \\ast X + b) - - Where: - - * :math:`X`: Input value, a tensor with NCHW format. - * :math:`V`: Filter direction value, a tensor with MCHW format. - * :math:`g`: Filter magnitude value, a tensor with M format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 - - Args: - name_scope(str) : The name for this class. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size (int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - stride (int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - padding (int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - dilation (int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv2d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1. - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str): Activation type, if it is set to None, activation is not appended. - Default: None - - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - - Examples: - .. code-block:: python - - from paddle.fluid.dygraph.base import to_variable - import paddle.fluid as fluid - from paddle.fluid.dygraph import Conv2D - import numpy as np - - data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32') - with fluid.dygraph.guard(): - conv2d = Conv2D( "conv2d", 2, 3) - data = to_variable( data ) - conv = conv2d( data ) - - """ - - def __init__(self, - name_scope, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - epsilon=1e-30, - dtype="float32"): - assert param_attr is not False, "param_attr should not be False here." - super(Conv2D, self).__init__(name_scope, dtype) - self._groups = groups - self._stride = utils.convert_to_list(stride, 2, "stride") - self._padding = utils.convert_to_list(padding, 2, "padding") - self._dilation = utils.convert_to_list(dilation, 2, "dilation") - self._act = act - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - self._use_cudnn = use_cudnn - self._filter_size = filter_size - self._num_filters = num_filters - self._param_attr = param_attr - self._bias_attr = bias_attr - self._epsilon = epsilon - self._dtype = dtype - # if (self._num_channels == self._groups and - # num_filters % self._num_channels == 0 and not self._use_cudnn): - # self._l_type = 'depthwise_conv2d' - # else: - # TODO(jiabin): recover the usage of depthwise_conv2d when it's - # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275 - self._l_type = "conv2d" - - def _build_once(self, input): - self._num_channels = input.shape[1] - if self._groups is None: - num_filter_channels = self._num_channels - else: - if self._num_channels % self._groups != 0: - raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = self._num_channels // self._groups - filter_size = utils.convert_to_list(self._filter_size, 2, "filter_size") - filter_shape = [self._num_filters, int(num_filter_channels) - ] + filter_size - - def _get_default_param_initializer(): - filter_elem_num = filter_size[0] * filter_size[ - 1] * self._num_channels - std = (2.0 / filter_elem_num)**0.5 - return Normal(0.0, std, 0) - - # weight_v - self._filter_param_v = self.create_parameter( - attr=self._param_attr, - shape=filter_shape, - dtype=self._dtype, - default_initializer=_get_default_param_initializer()) - - # weight_g - norm_value = _norm( - self._filter_param_v.numpy(), dim=0) # CAUTION: hard-code - self._filter_param_g = self.create_parameter( - attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - norm_value)), - shape=norm_value.shape, - dtype=self._dtype, - default_initializer=_get_default_param_initializer()) - - if self._use_cudnn: - self.create_variable( - name="kCUDNNFwdAlgoCache", - persistable=True, - type=core.VarDesc.VarType.RAW) - self.create_variable( - name="kCUDNNBwdDataAlgoCache", - persistable=True, - type=core.VarDesc.VarType.RAW) - self.create_variable( - name="kCUDNNBwdFilterAlgoCache", - persistable=True, - type=core.VarDesc.VarType.RAW) - - self._bias_param = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) - - def forward(self, input): - matrix = self._helper.create_variable_for_type_inference(self._dtype) - tmp = self._helper.create_variable_for_type_inference(self._dtype) - new_shape = [ - self._filter_param_v.shape[0], - reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1), - ] - - self._helper.append_op( - type="reshape2", - inputs={"X": self._filter_param_v}, - attrs={"shape": new_shape}, - outputs={"Out": matrix, - "XShape": tmp}) - - m_norm = self._helper.create_variable_for_type_inference(self._dtype) - m_normalized = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="norm", - inputs={"X": matrix}, - outputs={"Out": m_normalized, - "Norm": m_norm}, - attrs={"axis": 1, - "epsilon": self._epsilon}) - - v_normalized = self._helper.create_variable_for_type_inference( - self._dtype) - tmp2 = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="reshape2", - inputs={"X": m_normalized}, - attrs={"shape": self._filter_param_v.shape}, - outputs={"Out": v_normalized, - "XShape": tmp2}) - - filter_param = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="elementwise_mul", - inputs={"X": [v_normalized], - "Y": [self._filter_param_g]}, - outputs={"Out": [filter_param]}, - attrs={"axis": 0}, # CAUTION: hard-code - ) - - pre_bias = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - - self._helper.append_op( - type=self._l_type, - inputs={"Input": input, - "Filter": filter_param}, - outputs={"Output": pre_bias}, - attrs={ - "strides": self._stride, - "paddings": self._padding, - "dilations": self._dilation, - "groups": self._groups if self._groups else 1, - "use_cudnn": self._use_cudnn, - "use_mkldnn": False, - }) - - if self._bias_param is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - self._helper.append_op( - type="elementwise_add", - inputs={"X": [pre_bias], - "Y": [self._bias_param]}, - outputs={"Out": [pre_act]}, - attrs={"axis": 1}) - else: - pre_act = pre_bias - - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(pre_act, act=self._act) - - -class Conv2DTranspose(dg.Layer): - """ - **Convlution2D transpose layer** - - The convolution2D transpose layer calculates the output based on the input, - filter, and dilations, strides, paddings. Input(Input) and output(Output) - are in NCHW format. Where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - Parameters(dilations, strides, paddings) are two elements. These two elements - represent height and width, respectively. The details of convolution transpose - layer, please refer to the following explanation and references - `therein `_. - If bias attribution and activation type are provided, bias is added to - the output of the convolution, and the corresponding activation function - is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma ((Vg) \\ast X + b) - - Where: - - * :math:`X`: Input value, a tensor with NCHW format. - * :math:`V`: Filter value, a tensor with MCHW format. - * :math:`g`: Filter value, a tensor with M format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) - - Args: - name_scope(str): The name of this class. - num_filters(int): The number of the filter. It is as same as the output - image channel. - output_size(int|tuple|None): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). None if use - filter_size, padding, and stride to calculate output_size. - if output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. - filter_size(int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. None if use output size to - calculate filter_size. Default: None. - padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - stride(int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - dilation(int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups(int): The groups number of the Conv2d transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - Default: groups = 1. - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str): Activation type, if it is set to None, activation is not appended. - Default: None. - - Returns: - Variable: The tensor variable storing the convolution transpose result. - - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((3, 32, 32)).astype('float32') - conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose( - 'Conv2DTranspose', num_filters=2, filter_size=3) - ret = conv2DTranspose(fluid.dygraph.base.to_variable(data)) - - """ - - def __init__(self, - name_scope, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - epsilon=1e-30, - act=None, - dtype="float32"): - super(Conv2DTranspose, self).__init__(name_scope, dtype) - assert (param_attr is not False - ), "param_attr should not be False in conv2d_transpose." - self._param_attr = param_attr - self._bias_attr = bias_attr - self._groups = groups - self._num_filters = num_filters - self._use_cudnn = use_cudnn - self._padding = padding - self._stride = stride - self._dilation = dilation - self._filter_size = filter_size - self._output_size = output_size - self._op_type = "conv2d_transpose" - self._epsilon = epsilon - - def _build_once(self, input): - input_channel = input.shape[1] - if (input_channel == self._groups and - self._num_filters == input_channel and not self._use_cudnn): - self._op_type = "depthwise_conv2d_transpose" - - if not isinstance(input, Variable): - raise TypeError("Input of conv2d_transpose must be Variable") - - self._padding = utils.convert_to_list(self._padding, 2, "padding") - self._stride = utils.convert_to_list(self._stride, 2, "stride") - self._dilation = utils.convert_to_list(self._dilation, 2, "dilation") - - if not isinstance(self._use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - - if self._filter_size is None: - if self._output_size is None: - raise ValueError( - "output_size must be set when filter_size is None") - if isinstance(self._output_size, int): - self._output_size = [self._output_size, self._output_size] - - h_in = input.shape[2] - w_in = input.shape[3] - - filter_size_h = (self._output_size[0] - - (h_in - 1) * self._stride[0] + 2 * self._padding[0] - - 1) // self._dilation[0] + 1 - filter_size_w = (self._output_size[1] - - (w_in - 1) * self._stride[1] + 2 * self._padding[1] - - 1) // self._dilation[1] + 1 - self._filter_size = [filter_size_h, filter_size_w] - else: - self._filter_size = utils.convert_to_list( - self._filter_size, 2, "conv2d_transpose.filter_size") - - if self._output_size is None: - self._output_size = [] - elif isinstance(self._output_size, list) or isinstance( - self._output_size, int): - self._output_size = utils.convert_to_list(self._output_size, 2, - "output_size") - else: - raise ValueError("output_size should be list or int") - self._padding = utils.convert_to_list(self._padding, 2, "padding") - self._groups = 1 if self._groups is None else self._groups - filter_shape = [ - input_channel, - self._num_filters // self._groups, - ] + self._filter_size - - # img filter v (direction) - self._img_filter_v = self.create_parameter( - dtype=input.dtype, shape=filter_shape, attr=self._param_attr) - - # img filter g (magnitude) - img_filter_magnitude = _norm( - self._img_filter_v.numpy(), dim=0) # CAUTION: hard-code - self._img_filter_g = self.create_parameter( - dtype=input.dtype, - shape=img_filter_magnitude.shape, - attr=fluid.ParamAttr( - initializer=NumpyArrayInitializer(img_filter_magnitude))) - - self._img_bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True) - - def forward(self, input): - matrix = self._helper.create_variable_for_type_inference(self._dtype) - tmp = self._helper.create_variable_for_type_inference(self._dtype) - new_shape = [ - self._img_filter_v.shape[0], - reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1), - ] - - self._helper.append_op( - type="reshape2", - inputs={"X": self._img_filter_v}, - attrs={"shape": new_shape}, - outputs={"Out": matrix, - "XShape": tmp}) - - m_norm = self._helper.create_variable_for_type_inference(self._dtype) - m_normalized = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="norm", - inputs={"X": matrix}, - outputs={"Out": m_normalized, - "Norm": m_norm}, - attrs={"axis": 1, - "epsilon": self._epsilon}) - - v_normalized = self._helper.create_variable_for_type_inference( - self._dtype) - tmp2 = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="reshape2", - inputs={"X": m_normalized}, - attrs={"shape": self._img_filter_v.shape}, - outputs={"Out": v_normalized, - "XShape": tmp2}) - - img_filter = self._helper.create_variable_for_type_inference( - self._dtype) - self._helper.append_op( - type="elementwise_mul", - inputs={"X": [v_normalized], - "Y": [self._img_filter_g]}, - outputs={"Out": [img_filter]}, - attrs={"axis": 0}, # CAUTION: hard-code - ) - - pre_bias = self._helper.create_variable_for_type_inference( - dtype=input.dtype) - self._helper.append_op( - type=self._op_type, - inputs={"Input": [input], - "Filter": [img_filter]}, - outputs={"Output": pre_bias}, - attrs={ - "output_size": self._output_size, - "strides": self._stride, - "paddings": self._padding, - "dilations": self._dilation, - "groups": self._groups, - "use_cudnn": self._use_cudnn, - }) - - if self._img_bias is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - self._helper.append_op( - type="elementwise_add", - inputs={"X": [pre_bias], - "Y": [self._img_bias]}, - outputs={"Out": [pre_act]}, - attrs={"axis": 1}) - else: - pre_act = pre_bias - - out = self._helper.append_activation(pre_act) - return out