194 lines
5.8 KiB
Python
194 lines
5.8 KiB
Python
|
from paddle.io import Dataset
|
|||
|
from pathlib import Path
|
|||
|
import re
|
|||
|
import pickle
|
|||
|
import yaml
|
|||
|
import tqdm
|
|||
|
from parakeet.audio import AudioProcessor, LogMagnitude
|
|||
|
import numpy as np
|
|||
|
import multiprocessing as mp
|
|||
|
from functools import partial
|
|||
|
|
|||
|
# Matches a single character in the range U+4E00..U+9FA5; used by is_zh().
zh_pattern = re.compile("[\u4e00-\u9fa5]")

# Tone vocabulary: special tokens plus the tone digits '0'..'5'.
_tones = {
    '<pad>', '<s>', '</s>',
    '0', '1', '2', '3', '4', '5',
}

# Pause marks as they appear in the text ('#' followed by one digit).
_pauses = {
    '#1', '#2', '#3', '#4',
}

# Syllable initials recognised by split_syllable().
_initials = {
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l',
    'g', 'k', 'h', 'j', 'q', 'x',
    'zh', 'ch', 'sh', 'r', 'z', 'c', 's',
}

# Syllable finals in the expanded spelling produced by convert().
_finals = {
    # finals without a medial, plus 'ii' / 'iii' (the variants of i)
    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou',
    'an', 'en', 'ang', 'eng', 'er',
    # finals starting with i
    'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou',
    'ian', 'ien', 'iang', 'ieng',
    # finals starting with u
    'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
    # finals starting with v
    'v', 've', 'van', 'ven', 'veng',
}

# Symbol emitted (as '&r5') after a syllable that ernized() reports as True.
_ernized_symbol = {'&r'}

_specials = {'<pad>', '<unk>', '<s>', '</s>'}

# Full phone vocabulary; load_baker_transcription() asserts membership in it.
_phones = set().union(
    _initials,
    _finals,
    _ernized_symbol,
    _specials,
    _pauses,
)
|
|||
|
|
|||
|
def is_zh(word):
    """Return True when ``word`` contains a character matched by ``zh_pattern``.

    ``zh_pattern`` is the module-level compiled regex; it is only read here,
    so no ``global`` declaration is needed.
    """
    return zh_pattern.search(word) is not None
|
|||
|
|
|||
|
|
|||
|
def ernized(syllable):
    """Return True when ``syllable`` has an 'r' as its second-to-last character.

    Syllables that start with "er" are excluded, so a plain "er" syllable is
    not reported. The check looks at ``syllable[-2]``, i.e. it expects one
    trailing character (the tone digit, as used by ``split_syllable``) after
    the 'r'.

    Inputs shorter than two characters cannot satisfy the condition and
    return False instead of raising ``IndexError``.
    """
    if len(syllable) < 2:
        return False
    return syllable[:2] != "er" and syllable[-2] == 'r'
|
|||
|
|
|||
|
def convert(syllable):
    """Rewrite a toneless pinyin syllable into the expanded phone spelling.

    A regex first expands a trailing ``o`` after b/p/m/f to ``uo``; then a
    fixed sequence of substring replacements is applied. The order of the
    table matters: later rules operate on the output of earlier ones.
    """
    # expansion of o -> uo (only at the end, only after b, p, m, f)
    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)

    ordered_rewrites = (
        # expansion for iong, ong
        ("iong", "veng"), ("ong", "ueng"),
        # expansion for ing, in
        ("ing", "ieng"), ("in", "ien"),
        # expansion for un, ui, iu
        ("un", "uen"), ("ui", "uei"), ("iu", "iou"),
        # rule for variants of i
        ("zi", "zii"), ("ci", "cii"), ("si", "sii"),
        ("zhi", "zhiii"), ("chi", "chiii"), ("shi", "shiii"),
        ("ri", "riii"),
        # rule for y preceding i, u
        ("yi", "i"), ("yu", "v"), ("y", "i"),
        # rule for w
        ("wu", "u"), ("w", "u"),
        # rule for v following j, q, x
        ("ju", "jv"), ("qu", "qv"), ("xu", "xv"),
    )
    for old, new in ordered_rewrites:
        syllable = syllable.replace(old, new)
    return syllable
|
|||
|
|
|||
|
def split_syllable(syllable: str):
    """Split one transcription item into parallel phone and tone lists.

    Items starting with "#" are returned unchanged as a single phone with
    tone '0'. For anything else the last character is taken as the tone and
    the rest is passed through ``convert``. If the converted text begins
    with an entry of ``_initials`` (two-character prefix tried first, then
    the first character), the result is ``[initial, remainder]`` with tones
    ``['0', tone]``; otherwise it is the whole text with ``[tone]``.

    Returns:
        A ``(phones, tones)`` tuple of equally long lists of strings.
    """
    if syllable.startswith("#"):
        # pause mark: kept whole, tone '0'
        return [syllable], ['0']

    tone = syllable[-1]
    body = convert(syllable[:-1])

    # Prefer the two-character prefix so that 'zh', 'ch', 'sh' win over
    # 'z', 'c', 's'.
    if body[:2] in _initials:
        cut = 2
    elif body[0] in _initials:
        cut = 1
    else:
        # no initial: the whole body is one phone carrying the tone
        return [body], [tone]

    # the initial gets tone '0'; the remainder carries the syllable's tone
    return [body[:cut], body[cut:]], ['0', tone]
|
|||
|
|
|||
|
|
|||
|
def load_baker_transcription(text: str, pinyin: str):
    """Align one labelled text line with its pinyin line and derive phones.

    Args:
        text: A line of the form ``"<sentence_id>\\t<text>"``; the text part
            may contain ``#`` marks, each read together with the character
            that follows it.
        pinyin: Whitespace-separated syllables for the same sentence.

    Returns:
        A dict with keys ``sentence_id``, ``text``, ``syllables``,
        ``phones`` and ``tones``.

    Raises:
        AssertionError: if a produced phone is not in ``_phones``.
    """
    sentence_id, text = text.strip().split("\t")
    syllables = pinyin.strip().split()

    results = []
    syl_idx = 0   # position in ``syllables``
    pos = 0       # position in ``text``
    n_syllables = len(syllables)
    n_chars = len(text)

    while syl_idx < n_syllables and pos < n_chars:
        char = text[pos]
        if is_zh(char):
            current = syllables[syl_idx]
            if ernized(current):
                # Drop the 'r' before the final character and emit a
                # separate '&r5' item; two text characters are consumed
                # (presumably the base character plus its erhua character).
                results.extend([current[:-2] + current[-1], '&r5'])
                pos += 2
            else:
                results.append(current)
                pos += 1
            syl_idx += 1
        elif char == "#":
            # '#' plus the following character form one pause item
            results.append(text[pos:pos + 2])
            pos += 2
        else:
            # any other character is skipped
            pos += 1

    # After the loop at most one more text position is inspected, so a '#'
    # mark directly following the last consumed character is kept.
    if pos < n_chars and text[pos] == "#":
        results.append(text[pos:pos + 2])

    phones, tones = [], []
    for item in results:
        item_phones, item_tones = split_syllable(item)
        phones.extend(item_phones)
        tones.extend(item_tones)

    for phone in phones:
        assert phone in _phones, phone

    return {
        "sentence_id": sentence_id,
        "text": text,
        "syllables": results,
        "phones": phones,
        "tones": tones,
    }
|
|||
|
|
|||
|
def process_utterance(record, p, n, dataset_root, mel_dir):
    """Compute and save the transformed mel spectrogram of one utterance.

    Args:
        record: Mapping with a ``"sentence_id"`` key.
        p: Object providing ``read_wav`` and ``mel_spectrogram``.
        n: Object providing ``transform``, applied to the mel spectrogram.
        dataset_root: Path whose ``Wave`` sub-directory holds the audio.
        mel_dir: Path of the directory the result is saved into.

    The result is written with ``np.save`` under ``mel_dir`` using the
    sentence id as the file name.
    """
    sentence_id = record["sentence_id"]
    wav_path = (dataset_root / "Wave" / sentence_id).with_suffix(".wav")

    wav = p.read_wav(str(wav_path))
    mel = n.transform(p.mel_spectrogram(wav))

    target = mel_dir / sentence_id
    np.save(str(target), mel)
|
|||
|
|
|||
|
def process_baker(dataset_root, output_dir, num_workers=16):
    """Preprocess the dataset: write metadata files and mel spectrograms.

    Reads ``ProsodyLabeling/000001-010000.txt`` under ``dataset_root``,
    pairs consecutive lines as (text, pinyin), converts each kept pair with
    ``load_baker_transcription`` and writes ``metadata.pickle`` and
    ``metadata.yaml`` into ``output_dir``. Mel spectrograms are then computed
    with ``process_utterance`` in a process pool and saved in
    ``output_dir/mel``.

    Args:
        dataset_root: Dataset directory (``~`` is expanded).
        output_dir: Output directory, created if missing (``~`` is expanded).
        num_workers: Size of the multiprocessing pool (default 16, the
            previously hard-coded value).

    Raises:
        ValueError: if the label file does not contain an even number of
            non-blank lines.
    """
    dataset_root = Path(dataset_root).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    mel_dir = output_dir / "mel"
    mel_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): positional arguments are presumably sample rate, FFT
    # size, window length and hop length -- confirm against
    # parakeet.audio.AudioProcessor.
    p = AudioProcessor(22050, 1024, 1024, 256, f_max=8000)
    n = LogMagnitude(1e-5)

    prosody_label_path = dataset_root / "ProsodyLabeling" / "000001-010000.txt"
    # Read explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding (the YAML output below is written as UTF-8 as well).
    with open(prosody_label_path, 'rt', encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Blank lines (e.g. a trailing newline) would break the pairing.
        lines = [line for line in stripped if line]

    if len(lines) % 2 != 0:
        raise ValueError(
            f"{prosody_label_path}: expected an even number of non-blank "
            f"lines (text/pinyin pairs), got {len(lines)}"
        )
    # even lines hold the text, odd lines the matching pinyin
    records = list(zip(lines[0::2], lines[1::2]))

    processed_records = []
    for record in tqdm.tqdm(records):
        # Pairs with 'B' in the text line or 'P' in the pinyin line are
        # skipped (presumably entries the transcription code cannot
        # handle -- TODO confirm).
        if 'B' in record[0] or 'P' in record[1]:
            continue
        processed_records.append(load_baker_transcription(*record))

    with open(output_dir / "metadata.pickle", 'wb') as f:
        pickle.dump(processed_records, f)

    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
        yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)

    print("metadata done!")

    func = partial(process_utterance, p=p, n=n, dataset_root=dataset_root, mel_dir=mel_dir)
    with mp.Pool(num_workers) as pool:
        # list() drains the lazy imap iterator so every utterance is
        # processed while tqdm reports progress.
        list(tqdm.tqdm(pool.imap(func, processed_records), desc="Baker", total=len(processed_records)))
|
|||
|
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Script entry point: preprocess the dataset at the default locations.
    process_baker(
        dataset_root="~/datasets/BZNSYP",
        output_dir="~/datasets/processed_BZNSYP",
    )
|