clean code for data processing
This commit is contained in:
parent
56f2552201
commit
e8a9a118bb
|
@ -1,49 +1,55 @@
|
||||||
|
import pickle
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import librosa
|
|
||||||
from paddle.io import Dataset
|
from paddle.io import Dataset
|
||||||
import pickle
|
|
||||||
from parakeet.frontend import Vocab
|
from parakeet.frontend import Vocab
|
||||||
from parakeet.data import batch_text_id, batch_spec
|
from parakeet.data import batch_text_id, batch_spec
|
||||||
|
|
||||||
from preprocess_transcription import _phones, _tones
|
from preprocess_transcription import _phones, _tones
|
||||||
voc_phones = Vocab(sorted(list(_phones)))
|
|
||||||
print(voc_phones)
|
|
||||||
voc_tones = Vocab(sorted(list(_tones)))
|
|
||||||
print(voc_tones)
|
|
||||||
|
|
||||||
# use yaml to store preprocessed aishell3 dataset
|
voc_phones = Vocab(sorted(list(_phones)))
|
||||||
|
print("vocab_phones:\n", voc_phones)
|
||||||
|
voc_tones = Vocab(sorted(list(_tones)))
|
||||||
|
print("vocab+tones:\n", voc_tones)
|
||||||
|
|
||||||
|
|
||||||
class AiShell3(Dataset):
|
class AiShell3(Dataset):
|
||||||
|
"""Processed AiShell3 dataset."""
|
||||||
def __init__(self, root):
|
def __init__(self, root):
|
||||||
|
super().__init__()
|
||||||
self.root = Path(root).expanduser()
|
self.root = Path(root).expanduser()
|
||||||
self.embed_dir = self.root / "embed"
|
self.embed_dir = self.root / "embed"
|
||||||
self.mel_dir = self.root / "mel"
|
self.mel_dir = self.root / "mel"
|
||||||
|
|
||||||
with open (self.root / "metadata.pickle", 'rb') as f:
|
with open(self.root / "metadata.pickle", 'rb') as f:
|
||||||
self.records = pickle.load(f)
|
self.records = pickle.load(f)
|
||||||
|
|
||||||
def __getitem__(self, index):
|
def __getitem__(self, index):
|
||||||
metadatum = self.records[index]
|
metadatum = self.records[index]
|
||||||
sentence_id = metadatum["sentence_id"]
|
sentence_id = metadatum["sentence_id"]
|
||||||
speaker_id = sentence_id[:7]
|
speaker_id = sentence_id[:7]
|
||||||
phones = metadatum["phones"]
|
phones = metadatum["phones"]
|
||||||
tones = metadatum["tones"]
|
tones = metadatum["tones"]
|
||||||
phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
phones = np.array([voc_phones.lookup(item) for item in phones],
|
||||||
tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
dtype=np.int64)
|
||||||
|
tones = np.array([voc_tones.lookup(item) for item in tones],
|
||||||
|
dtype=np.int64)
|
||||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||||
embed = np.load(str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
embed = np.load(
|
||||||
|
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||||
return phones, tones, mel, embed
|
return phones, tones, mel, embed
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.records)
|
return len(self.records)
|
||||||
|
|
||||||
|
|
||||||
def collate_aishell3_examples(examples):
|
def collate_aishell3_examples(examples):
|
||||||
phones, tones, mel, embed = list(zip(*examples))
|
phones, tones, mel, embed = list(zip(*examples))
|
||||||
|
|
||||||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||||
T_dec = np.max(spec_lengths)
|
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
|
||||||
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
|
-1)).astype(np.float32)
|
||||||
phones, _ = batch_text_id(phones)
|
phones, _ = batch_text_id(phones)
|
||||||
tones, _ = batch_text_id(tones)
|
tones, _ = batch_text_id(tones)
|
||||||
mel, _ = batch_spec(mel)
|
mel, _ = batch_spec(mel)
|
||||||
|
@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
|
||||||
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
||||||
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
dataset = AiShell3("~/datasets/aishell3/train")
|
dataset = AiShell3("~/datasets/aishell3/train")
|
||||||
example = dataset[0]
|
example = dataset[0]
|
||||||
|
|
||||||
examples = [dataset[i] for i in range(10)]
|
examples = [dataset[i] for i in range(10)]
|
||||||
batch = collate_aishell3_examples(examples)
|
batch = collate_aishell3_examples(examples)
|
||||||
|
|
||||||
for field in batch:
|
for field in batch:
|
||||||
print(field.shape, field.dtype)
|
print(field.shape, field.dtype)
|
||||||
|
|
||||||
|
|
|
@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
|
||||||
from chinese_phonology import convert, split_syllable
|
from chinese_phonology import convert, split_syllable
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
|
||||||
def convert_sentence(text: str) -> List[Tuple[str]]:
|
def convert_sentence(text: str) -> List[Tuple[str]]:
|
||||||
syllables = convert_to_pinyin(text)
|
syllables = convert_to_pinyin(text)
|
||||||
syllables = [item[0] for item in syllables]
|
|
||||||
phones = []
|
phones = []
|
||||||
tones = []
|
tones = []
|
||||||
for syllable in syllables:
|
for syllable in syllables:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from pypinyin import pinyin, Style
|
from pypinyin import lazy_pinyin, Style
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
|
||||||
"""Convert text into a list of syllables. Other characters that are not Chinese, and thus
|
"""Convert text into a list of syllables. Other characters that are not Chinese, and thus
|
||||||
cannot be converted to pinyin, are split.
|
cannot be converted to pinyin, are split.
|
||||||
"""
|
"""
|
||||||
syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
|
syllables = lazy_pinyin(text,
|
||||||
|
style=Style.TONE3,
|
||||||
|
neutral_tone_with_five=True)
|
||||||
return syllables
|
return syllables
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,8 @@ _C.data = CN(
|
||||||
n_fft=1024, # fft frame size
|
n_fft=1024, # fft frame size
|
||||||
win_length=1024, # window size
|
win_length=1024, # window size
|
||||||
hop_length=256, # hop size between adjacent frames
|
hop_length=256, # hop size between adjacent frames
|
||||||
f_max=8000, # Hz, max frequency when converting to mel
|
fmax=8000, # Hz, max frequency when converting to mel
|
||||||
f_min=0, # Hz, min frequency when converting to mel
|
fmin=0, # Hz, min frequency when converting to mel
|
||||||
d_mels=80, # mel bands
|
d_mels=80, # mel bands
|
||||||
padding_idx=0, # text embedding's padding index
|
padding_idx=0, # text embedding's padding index
|
||||||
))
|
))
|
||||||
|
@ -38,8 +38,10 @@ _C.model = CN(
|
||||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||||
d_prenet=256, # hidden size of decoder prenet
|
d_prenet=256, # hidden size of decoder prenet
|
||||||
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
|
d_attention_rnn=
|
||||||
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
|
1024, # hidden size of the first rnn layer in tacotron2 decoder
|
||||||
|
d_decoder_rnn=
|
||||||
|
1024, # hidden size of the second rnn layer in tacotron2 decoder
|
||||||
d_attention=128, # hidden size of decoder location linear layer
|
d_attention=128, # hidden size of decoder location linear layer
|
||||||
attention_filters=32, # number of filter in decoder location conv layer
|
attention_filters=32, # number of filter in decoder location conv layer
|
||||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||||
|
@ -48,8 +50,10 @@ _C.model = CN(
|
||||||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||||
p_encoder_dropout=0.5, # dropout probability in encoder
|
p_encoder_dropout=0.5, # dropout probability in encoder
|
||||||
p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
||||||
p_attention_dropout=0.1, # dropout probability of first rnn layer in decoder
|
p_attention_dropout=
|
||||||
p_decoder_dropout=0.1, # dropout probability of second rnn layer in decoder
|
0.1, # dropout probability of first rnn layer in decoder
|
||||||
|
p_decoder_dropout=
|
||||||
|
0.1, # dropout probability of second rnn layer in decoder
|
||||||
p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
||||||
guided_attention_loss_sigma=0.2,
|
guided_attention_loss_sigma=0.2,
|
||||||
d_global_condition=256,
|
d_global_condition=256,
|
||||||
|
@ -71,5 +75,3 @@ def get_cfg_defaults():
|
||||||
# Return a clone so that the defaults will not be altered
|
# Return a clone so that the defaults will not be altered
|
||||||
# This is for the "local variable" use pattern
|
# This is for the "local variable" use pattern
|
||||||
return _C.clone()
|
return _C.clone()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,20 @@
|
||||||
|
import argparse
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from parakeet.audio import AudioProcessor
|
from parakeet.audio import AudioProcessor
|
||||||
from parakeet.audio.spec_normalizer import LogMagnitude
|
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import tqdm
|
import tqdm
|
||||||
from yacs.config import CfgNode
|
|
||||||
|
|
||||||
def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n):
|
from config import get_cfg_defaults
|
||||||
|
|
||||||
|
|
||||||
|
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
|
||||||
|
p: AudioProcessor, n: NormalizerBase):
|
||||||
relative_path = fname.relative_to(input_dir)
|
relative_path = fname.relative_to(input_dir)
|
||||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
# TODO: maybe we need to rescale the audio
|
|
||||||
wav = p.read_wav(fname)
|
wav = p.read_wav(fname)
|
||||||
mel = p.mel_spectrogram(wav)
|
mel = p.mel_spectrogram(wav)
|
||||||
mel = n.transform(mel)
|
mel = n.transform(mel)
|
||||||
|
@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
||||||
config.hop_length, config.n_mels, config.fmin,
|
config.hop_length, config.n_mels, config.fmin,
|
||||||
config.fmax)
|
config.fmax)
|
||||||
n = LogMagnitude(1e-5)
|
n = LogMagnitude(1e-5)
|
||||||
|
|
||||||
func = partial(extract_mel,
|
func = partial(extract_mel,
|
||||||
input_dir=input_dir,
|
input_dir=input_dir,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
p=p,
|
p=p,
|
||||||
n=n)
|
n=n)
|
||||||
|
|
||||||
with mp.Pool(16) as pool:
|
with mp.Pool(16) as pool:
|
||||||
list(
|
list(
|
||||||
tqdm.tqdm(pool.imap(func, fnames),
|
tqdm.tqdm(pool.imap(func, fnames),
|
||||||
total=len(fnames),
|
total=len(fnames),
|
||||||
unit="utterance"))
|
unit="utterance"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
audio_config = {
|
parser = argparse.ArgumentParser(
|
||||||
"sample_rate": 22050,
|
description=
|
||||||
"n_fft": 1024,
|
"Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||||
"win_length": 1024,
|
)
|
||||||
"hop_length": 256,
|
parser.add_argument(
|
||||||
"n_mels": 80,
|
"--config",
|
||||||
"fmin": 0,
|
type=str,
|
||||||
"fmax": 8000}
|
help="yaml config file to overwrite the default config")
|
||||||
audio_config = CfgNode(audio_config)
|
parser.add_argument("--input",
|
||||||
extract_mel_multispeaker(audio_config, "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/mel")
|
type=str,
|
||||||
|
default="~/datasets/aishell3/train/normalized_wav",
|
||||||
|
help="path of the processed wav folder")
|
||||||
|
parser.add_argument("--output",
|
||||||
|
type=str,
|
||||||
|
default="~/datasets/aishell3/train/mel",
|
||||||
|
help="path of the folder to save mel spectrograms")
|
||||||
|
parser.add_argument(
|
||||||
|
"--opts",
|
||||||
|
nargs=argparse.REMAINDER,
|
||||||
|
help=
|
||||||
|
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||||
|
)
|
||||||
|
default_config = get_cfg_defaults()
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
if args.config:
|
||||||
|
default_config.merge_from_file(args.config)
|
||||||
|
if args.opts:
|
||||||
|
default_config.merge_from_list(args.opts)
|
||||||
|
default_config.freeze()
|
||||||
|
audio_config = default_config.data
|
||||||
|
|
||||||
|
extract_mel_multispeaker(audio_config, args.input, args.output)
|
||||||
|
|
|
@ -1,13 +1,10 @@
|
||||||
from paddle.io import Dataset
|
import argparse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
import tqdm
|
import tqdm
|
||||||
from parakeet.audio import AudioProcessor, LogMagnitude
|
|
||||||
import numpy as np
|
|
||||||
import multiprocessing as mp
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
||||||
|
|
||||||
|
@ -16,21 +13,71 @@ _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
|
||||||
_pauses = {'%', '$'}
|
_pauses = {'%', '$'}
|
||||||
|
|
||||||
_initials = {
|
_initials = {
|
||||||
'b', 'p', 'm', 'f',
|
'b',
|
||||||
'd', 't', 'n', 'l',
|
'p',
|
||||||
'g', 'k', 'h',
|
'm',
|
||||||
'j', 'q', 'x',
|
'f',
|
||||||
'zh', 'ch', 'sh',
|
'd',
|
||||||
|
't',
|
||||||
|
'n',
|
||||||
|
'l',
|
||||||
|
'g',
|
||||||
|
'k',
|
||||||
|
'h',
|
||||||
|
'j',
|
||||||
|
'q',
|
||||||
|
'x',
|
||||||
|
'zh',
|
||||||
|
'ch',
|
||||||
|
'sh',
|
||||||
'r',
|
'r',
|
||||||
'z', 'c', 's',
|
'z',
|
||||||
|
'c',
|
||||||
|
's',
|
||||||
}
|
}
|
||||||
|
|
||||||
_finals = {
|
_finals = {
|
||||||
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er',
|
'ii',
|
||||||
'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng',
|
'iii',
|
||||||
'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
|
'a',
|
||||||
'v', 've', 'van', 'ven', 'veng',
|
'o',
|
||||||
}
|
'e',
|
||||||
|
'ea',
|
||||||
|
'ai',
|
||||||
|
'ei',
|
||||||
|
'ao',
|
||||||
|
'ou',
|
||||||
|
'an',
|
||||||
|
'en',
|
||||||
|
'ang',
|
||||||
|
'eng',
|
||||||
|
'er',
|
||||||
|
'i',
|
||||||
|
'ia',
|
||||||
|
'io',
|
||||||
|
'ie',
|
||||||
|
'iai',
|
||||||
|
'iao',
|
||||||
|
'iou',
|
||||||
|
'ian',
|
||||||
|
'ien',
|
||||||
|
'iang',
|
||||||
|
'ieng',
|
||||||
|
'u',
|
||||||
|
'ua',
|
||||||
|
'uo',
|
||||||
|
'uai',
|
||||||
|
'uei',
|
||||||
|
'uan',
|
||||||
|
'uen',
|
||||||
|
'uang',
|
||||||
|
'ueng',
|
||||||
|
'v',
|
||||||
|
've',
|
||||||
|
'van',
|
||||||
|
'ven',
|
||||||
|
'veng',
|
||||||
|
}
|
||||||
|
|
||||||
_ernized_symbol = {'&r'}
|
_ernized_symbol = {'&r'}
|
||||||
|
|
||||||
|
@ -38,6 +85,7 @@ _specials = {'<pad>', '<unk>', '<s>', '</s>'}
|
||||||
|
|
||||||
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
||||||
|
|
||||||
|
|
||||||
def is_zh(word):
|
def is_zh(word):
|
||||||
global zh_pattern
|
global zh_pattern
|
||||||
match = zh_pattern.search(word)
|
match = zh_pattern.search(word)
|
||||||
|
@ -47,6 +95,7 @@ def is_zh(word):
|
||||||
def ernized(syllable):
|
def ernized(syllable):
|
||||||
return syllable[:2] != "er" and syllable[-2] == 'r'
|
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||||
|
|
||||||
|
|
||||||
def convert(syllable):
|
def convert(syllable):
|
||||||
# expansion of o -> uo
|
# expansion of o -> uo
|
||||||
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
||||||
|
@ -56,15 +105,17 @@ def convert(syllable):
|
||||||
|
|
||||||
# expansion for ing, in
|
# expansion for ing, in
|
||||||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||||
|
|
||||||
# expansion for un, ui, iu
|
# expansion for un, ui, iu
|
||||||
syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
|
syllable = syllable.replace("un",
|
||||||
|
"uen").replace("ui",
|
||||||
|
"uei").replace("iu", "iou")
|
||||||
|
|
||||||
# rule for variants of i
|
# rule for variants of i
|
||||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||||
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
||||||
.replace("ri", "riii")
|
.replace("ri", "riii")
|
||||||
|
|
||||||
# rule for y preceding i, u
|
# rule for y preceding i, u
|
||||||
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
||||||
|
|
||||||
|
@ -72,11 +123,21 @@ def convert(syllable):
|
||||||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||||
|
|
||||||
# rule for v following j, q, x
|
# rule for v following j, q, x
|
||||||
syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv")
|
syllable = syllable.replace("ju", "jv").replace("qu",
|
||||||
|
"qv").replace("xu", "xv")
|
||||||
|
|
||||||
return syllable
|
return syllable
|
||||||
|
|
||||||
def split_syllable(syllable:str):
|
|
||||||
|
def split_syllable(syllable: str):
|
||||||
|
"""Split a syllable in pinyin into a list of phones and a list of tones.
|
||||||
|
Initials have no tone, represented by '0', while finals have tones from
|
||||||
|
'1,2,3,4,5'.
|
||||||
|
|
||||||
|
e.g.
|
||||||
|
|
||||||
|
zhang -> ['zh', 'ang'], ['0', '1']
|
||||||
|
"""
|
||||||
if syllable in _pauses:
|
if syllable in _pauses:
|
||||||
# syllable, tone
|
# syllable, tone
|
||||||
return [syllable], ['0']
|
return [syllable], ['0']
|
||||||
|
@ -104,7 +165,7 @@ def split_syllable(syllable:str):
|
||||||
return phones, tones
|
return phones, tones
|
||||||
|
|
||||||
|
|
||||||
def load_aishell3_transcription(line:str):
|
def load_aishell3_transcription(line: str):
|
||||||
sentence_id, pinyin, text = line.strip().split("|")
|
sentence_id, pinyin, text = line.strip().split("|")
|
||||||
syllables = pinyin.strip().split()
|
syllables = pinyin.strip().split()
|
||||||
|
|
||||||
|
@ -118,7 +179,7 @@ def load_aishell3_transcription(line:str):
|
||||||
else:
|
else:
|
||||||
results.append(syllable[:-2] + syllable[-1])
|
results.append(syllable[:-2] + syllable[-1])
|
||||||
results.append('&r5')
|
results.append('&r5')
|
||||||
|
|
||||||
phones = []
|
phones = []
|
||||||
tones = []
|
tones = []
|
||||||
for syllable in results:
|
for syllable in results:
|
||||||
|
@ -127,7 +188,13 @@ def load_aishell3_transcription(line:str):
|
||||||
tones.extend(t)
|
tones.extend(t)
|
||||||
for p in phones:
|
for p in phones:
|
||||||
assert p in _phones, p
|
assert p in _phones, p
|
||||||
return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
|
return {
|
||||||
|
"sentence_id": sentence_id,
|
||||||
|
"text": text,
|
||||||
|
"syllables": results,
|
||||||
|
"phones": phones,
|
||||||
|
"tones": tones
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def process_aishell3(dataset_root, output_dir):
|
def process_aishell3(dataset_root, output_dir):
|
||||||
|
@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
prosody_label_path = dataset_root / "label_train-set.txt"
|
prosody_label_path = dataset_root / "label_train-set.txt"
|
||||||
with open(prosody_label_path, 'rt') as f:
|
with open(prosody_label_path, 'rt') as f:
|
||||||
lines = [line.strip() for line in f]
|
lines = [line.strip() for line in f]
|
||||||
|
|
||||||
records = lines[5:]
|
records = lines[5:]
|
||||||
|
|
||||||
processed_records = []
|
processed_records = []
|
||||||
for record in tqdm.tqdm(records):
|
for record in tqdm.tqdm(records):
|
||||||
new_record = load_aishell3_transcription(record)
|
new_record = load_aishell3_transcription(record)
|
||||||
processed_records.append(new_record)
|
processed_records.append(new_record)
|
||||||
print(new_record)
|
print(new_record)
|
||||||
|
|
||||||
with open(output_dir / "metadata.pickle", 'wb') as f:
|
with open(output_dir / "metadata.pickle", 'wb') as f:
|
||||||
pickle.dump(processed_records, f)
|
pickle.dump(processed_records, f)
|
||||||
|
|
||||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
|
||||||
yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
|
|
||||||
|
|
||||||
print("metadata done!")
|
|
||||||
|
|
||||||
|
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||||
|
yaml.safe_dump(processed_records,
|
||||||
|
f,
|
||||||
|
default_flow_style=None,
|
||||||
|
allow_unicode=True)
|
||||||
|
|
||||||
|
print("metadata done!")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
process_aishell3("~/datasets/aishell3/train", "~/datasets/aishell3/train")
|
parser = argparse.ArgumentParser(
|
||||||
|
description=
|
||||||
|
"Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input",
|
||||||
|
type=str,
|
||||||
|
default="~/datasets/aishell3/train",
|
||||||
|
help="path of the training dataset,(contains a label_train-set.txt).")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
type=str,
|
||||||
|
help="the directory to save the processed transcription."
|
||||||
|
"If not provided, it would be the same as the input.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
if args.output is None:
|
||||||
|
args.output = args.input
|
||||||
|
|
||||||
|
process_aishell3(args.input, args.output)
|
||||||
|
|
|
@ -1,15 +1,17 @@
|
||||||
import librosa
|
|
||||||
import soundfile as sf
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
from tqdm import tqdm
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import librosa
|
||||||
|
import soundfile as sf
|
||||||
|
from tqdm import tqdm
|
||||||
from praatio import tgio
|
from praatio import tgio
|
||||||
|
|
||||||
|
|
||||||
def get_valid_part(fpath):
|
def get_valid_part(fpath):
|
||||||
f = tgio.openTextgrid(fpath)
|
f = tgio.openTextgrid(fpath)
|
||||||
|
|
||||||
start = 0
|
start = 0
|
||||||
phone_entry_list = f.tierDict['phones'].entryList
|
phone_entry_list = f.tierDict['phones'].entryList
|
||||||
first_entry = phone_entry_list[0]
|
first_entry = phone_entry_list[0]
|
||||||
|
@ -22,7 +24,7 @@ def get_valid_part(fpath):
|
||||||
else:
|
else:
|
||||||
end = last_entry.end
|
end = last_entry.end
|
||||||
return start, end
|
return start, end
|
||||||
|
|
||||||
|
|
||||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||||
rel_path = fpath.relative_to(source_dir)
|
rel_path = fpath.relative_to(source_dir)
|
||||||
|
@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
||||||
source_dir = Path(source_dir).expanduser()
|
source_dir = Path(source_dir).expanduser()
|
||||||
target_dir = Path(target_dir).expanduser()
|
target_dir = Path(target_dir).expanduser()
|
||||||
alignment_dir = Path(alignment_dir).expanduser()
|
alignment_dir = Path(alignment_dir).expanduser()
|
||||||
|
|
||||||
wav_paths = list(source_dir.rglob("*.wav"))
|
wav_paths = list(source_dir.rglob("*.wav"))
|
||||||
print(f"there are {len(wav_paths)} audio files in total")
|
print(f"there are {len(wav_paths)} audio files in total")
|
||||||
fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir)
|
fx = partial(process_utterance,
|
||||||
|
source_dir=source_dir,
|
||||||
|
target_dir=target_dir,
|
||||||
|
alignment_dir=alignment_dir)
|
||||||
with Pool(16) as p:
|
with Pool(16) as p:
|
||||||
list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
list(
|
||||||
|
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
|
||||||
|
unit="utterance"))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
preprocess_aishell3("~/datasets/aishell3/train/wav", "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/alignment")
|
preprocess_aishell3("~/datasets/aishell3/train/wav",
|
||||||
|
"~/datasets/aishell3/train/normalized_wav",
|
||||||
|
"~/datasets/aishell3/train/alignment")
|
||||||
|
|
||||||
|
|
|
@ -12,19 +12,15 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import time
|
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import distributed as dist
|
from paddle import distributed as dist
|
||||||
from paddle.io import DistributedBatchSampler
|
from paddle.io import DistributedBatchSampler
|
||||||
from visualdl import LogWriter
|
from visualdl import LogWriter
|
||||||
|
|
||||||
import parakeet
|
|
||||||
from parakeet.utils import checkpoint, mp_tools
|
from parakeet.utils import checkpoint, mp_tools
|
||||||
|
|
||||||
__all__ = ["ExperimentBase"]
|
__all__ = ["ExperimentBase"]
|
||||||
|
|
Loading…
Reference in New Issue