ParakeetEricRoss/examples/speedyspeech/baker/frontend.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from pathlib import Path

import numpy as np
import paddle
from pypinyin import lazy_pinyin, Style
import jieba
import phkit
phkit.initialize()
from parakeet.frontend.vocab import Vocab

file_dir = Path(__file__).parent.resolve()
with open(file_dir / "phones.txt", 'rt') as f:
    phones = [line.strip() for line in f.readlines()]

with open(file_dir / "tones.txt", 'rt') as f:
    tones = [line.strip() for line in f.readlines()]
voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)
voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)


def segment(sentence):
    segments = re.split(r'[：，；。？！]', sentence)
    segments = [seg for seg in segments if len(seg)]
    return segments


def g2p(sentence):
    segments = segment(sentence)
    phones = []
    phones.append('sil')
    tones = []
    tones.append('0')

    for seg in segments:
        seg = jieba.lcut(seg)
        initials = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for c, v in zip(initials, finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if re.match(r'i\d', v):
                if c in ['z', 'c', 's']:
                    v = re.sub('i', 'ii', v)
                elif c in ['zh', 'ch', 'sh', 'r']:
                    v = re.sub('i', 'iii', v)
            if c:
                phones.append(c)
                tones.append('0')
            if v:
                phones.append(v[:-1])
                tones.append(v[-1])
        phones.append('sp')
        tones.append('0')
    phones[-1] = 'sil'
    tones[-1] = '0'
    return (phones, tones)


def p2id(voc, phonemes):
    phone_ids = [voc.lookup(item) for item in phonemes]
    return np.array(phone_ids, np.int64)


def t2id(voc, tones):
    tone_ids = [voc.lookup(item) for item in tones]
    return np.array(tone_ids, np.int64)


def text_analysis(sentence):
    phonemes, tones = g2p(sentence)
    print(sentence)
    print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])
    phone_ids = p2id(voc_phones, phonemes)
    tone_ids = t2id(voc_tones, tones)
    phones = paddle.to_tensor(phone_ids)
    tones = paddle.to_tensor(tone_ids)
    return phones, tones
add e2e inference script 2021-07-13 10:51:17 +08:00			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import re`
1. use relative path in metadata.jsonl; 2. support key=value format to pass extra command line arguments to modify config values; 3. use path relative to the config.py to locate the default config. 2021-08-16 10:01:51 +08:00			`from pathlib import Path`

add e2e inference script 2021-07-13 10:51:17 +08:00			`import numpy as np`
			`import paddle`
			`from pypinyin import lazy_pinyin, Style`
			`import jieba`
			`import phkit`
			`phkit.initialize()`
			`from parakeet.frontend.vocab import Vocab`

1. use relative path in metadata.jsonl; 2. support key=value format to pass extra command line arguments to modify config values; 3. use path relative to the config.py to locate the default config. 2021-08-16 10:01:51 +08:00			`file_dir = Path(__file__).parent.resolve()`
			`with open(file_dir / "phones.txt", 'rt') as f:`
add e2e inference script 2021-07-13 10:51:17 +08:00			`phones = [line.strip() for line in f.readlines()]`

1. use relative path in metadata.jsonl; 2. support key=value format to pass extra command line arguments to modify config values; 3. use path relative to the config.py to locate the default config. 2021-08-16 10:01:51 +08:00			`with open(file_dir / "tones.txt", 'rt') as f:`
add e2e inference script 2021-07-13 10:51:17 +08:00			`tones = [line.strip() for line in f.readlines()]`
			`voc_phones = Vocab(phones, start_symbol=None, end_symbol=None)`
			`voc_tones = Vocab(tones, start_symbol=None, end_symbol=None)`


			`def segment(sentence):`
			`segments = re.split(r'[：，；。？！]', sentence)`
			`segments = [seg for seg in segments if len(seg)]`
			`return segments`


			`def g2p(sentence):`
			`segments = segment(sentence)`
			`phones = []`
			`phones.append('sil')`
			`tones = []`
			`tones.append('0')`

			`for seg in segments:`
			`seg = jieba.lcut(seg)`
			`initials = lazy_pinyin(`
			`seg, neutral_tone_with_five=True, style=Style.INITIALS)`
			`finals = lazy_pinyin(`
			`seg, neutral_tone_with_five=True, style=Style.FINALS_TONE3)`
			`for c, v in zip(initials, finals):`
			`# NOTE: post process for pypinyin outputs`
			`# we discriminate i, ii and iii`
			`if re.match(r'i\d', v):`
			`if c in ['z', 'c', 's']:`
			`v = re.sub('i', 'ii', v)`
			`elif c in ['zh', 'ch', 'sh', 'r']:`
			`v = re.sub('i', 'iii', v)`
			`if c:`
			`phones.append(c)`
			`tones.append('0')`
			`if v:`
			`phones.append(v[:-1])`
			`tones.append(v[-1])`
			`phones.append('sp')`
			`tones.append('0')`
			`phones[-1] = 'sil'`
			`tones[-1] = '0'`
			`return (phones, tones)`


			`def p2id(voc, phonemes):`
			`phone_ids = [voc.lookup(item) for item in phonemes]`
			`return np.array(phone_ids, np.int64)`


			`def t2id(voc, tones):`
			`tone_ids = [voc.lookup(item) for item in tones]`
			`return np.array(tone_ids, np.int64)`


			`def text_analysis(sentence):`
			`phonemes, tones = g2p(sentence)`
			`print(sentence)`
			`print([p + t if t != '0' else p for p, t in zip(phonemes, tones)])`
			`phone_ids = p2id(voc_phones, phonemes)`
			`tone_ids = t2id(voc_tones, tones)`
			`phones = paddle.to_tensor(phone_ids)`
			`tones = paddle.to_tensor(tone_ids)`
			`return phones, tones`