clean code for data processing

This commit is contained in:
iclementine 2021-04-22 17:20:34 +08:00
parent 56f2552201
commit e8a9a118bb
8 changed files with 226 additions and 104 deletions

View File

@ -1,49 +1,55 @@
import pickle
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import librosa
from paddle.io import Dataset from paddle.io import Dataset
import pickle
from parakeet.frontend import Vocab from parakeet.frontend import Vocab
from parakeet.data import batch_text_id, batch_spec from parakeet.data import batch_text_id, batch_spec
from preprocess_transcription import _phones, _tones from preprocess_transcription import _phones, _tones
voc_phones = Vocab(sorted(list(_phones)))
print(voc_phones)
voc_tones = Vocab(sorted(list(_tones)))
print(voc_tones)
# use yaml to store preprocessed aishell3 dataset voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
voc_tones = Vocab(sorted(list(_tones)))
print("vocab+tones:\n", voc_tones)
class AiShell3(Dataset): class AiShell3(Dataset):
"""Processed AiShell3 dataset."""
def __init__(self, root): def __init__(self, root):
super().__init__()
self.root = Path(root).expanduser() self.root = Path(root).expanduser()
self.embed_dir = self.root / "embed" self.embed_dir = self.root / "embed"
self.mel_dir = self.root / "mel" self.mel_dir = self.root / "mel"
with open (self.root / "metadata.pickle", 'rb') as f: with open(self.root / "metadata.pickle", 'rb') as f:
self.records = pickle.load(f) self.records = pickle.load(f)
def __getitem__(self, index): def __getitem__(self, index):
metadatum = self.records[index] metadatum = self.records[index]
sentence_id = metadatum["sentence_id"] sentence_id = metadatum["sentence_id"]
speaker_id = sentence_id[:7] speaker_id = sentence_id[:7]
phones = metadatum["phones"] phones = metadatum["phones"]
tones = metadatum["tones"] tones = metadatum["tones"]
phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64) phones = np.array([voc_phones.lookup(item) for item in phones],
tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64) dtype=np.int64)
tones = np.array([voc_tones.lookup(item) for item in tones],
dtype=np.int64)
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy"))) mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
embed = np.load(str(self.embed_dir / speaker_id / (sentence_id + ".npy"))) embed = np.load(
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
return phones, tones, mel, embed return phones, tones, mel, embed
def __len__(self): def __len__(self):
return len(self.records) return len(self.records)
def collate_aishell3_examples(examples): def collate_aishell3_examples(examples):
phones, tones, mel, embed = list(zip(*examples)) phones, tones, mel, embed = list(zip(*examples))
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64) text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64) spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths) stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32) -1)).astype(np.float32)
phones, _ = batch_text_id(phones) phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones) tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel) mel, _ = batch_spec(mel)
@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T) # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
if __name__ == "__main__": if __name__ == "__main__":
dataset = AiShell3("~/datasets/aishell3/train") dataset = AiShell3("~/datasets/aishell3/train")
example = dataset[0] example = dataset[0]
examples = [dataset[i] for i in range(10)] examples = [dataset[i] for i in range(10)]
batch = collate_aishell3_examples(examples) batch = collate_aishell3_examples(examples)
for field in batch: for field in batch:
print(field.shape, field.dtype) print(field.shape, field.dtype)

View File

@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
from chinese_phonology import convert, split_syllable from chinese_phonology import convert, split_syllable
from typing import List, Tuple from typing import List, Tuple
def convert_sentence(text: str) -> List[Tuple[str]]: def convert_sentence(text: str) -> List[Tuple[str]]:
syllables = convert_to_pinyin(text) syllables = convert_to_pinyin(text)
syllables = [item[0] for item in syllables]
phones = [] phones = []
tones = [] tones = []
for syllable in syllables: for syllable in syllables:

View File

@ -1,4 +1,4 @@
from pypinyin import pinyin, Style from pypinyin import lazy_pinyin, Style
from typing import List from typing import List
@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
"""convert text into list of syllables, other characters that are not chinese, thus """convert text into list of syllables, other characters that are not chinese, thus
cannot be converted to pinyin are splited. cannot be converted to pinyin are splited.
""" """
syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True) syllables = lazy_pinyin(text,
style=Style.TONE3,
neutral_tone_with_five=True)
return syllables return syllables

View File

@ -23,8 +23,8 @@ _C.data = CN(
n_fft=1024, # fft frame size n_fft=1024, # fft frame size
win_length=1024, # window size win_length=1024, # window size
hop_length=256, # hop size between ajacent frame hop_length=256, # hop size between ajacent frame
f_max=8000, # Hz, max frequency when converting to mel fmax=8000, # Hz, max frequency when converting to mel
f_min=0, # Hz, min frequency when converting to mel fmin=0, # Hz, min frequency when converting to mel
d_mels=80, # mel bands d_mels=80, # mel bands
padding_idx=0, # text embedding's padding index padding_idx=0, # text embedding's padding index
)) ))
@ -38,8 +38,10 @@ _C.model = CN(
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
d_prenet=256, # hidden size of decoder prenet d_prenet=256, # hidden size of decoder prenet
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder d_attention_rnn=
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder 1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=
1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention=128, # hidden size of decoder location linear layer d_attention=128, # hidden size of decoder location linear layer
attention_filters=32, # number of filter in decoder location conv layer attention_filters=32, # number of filter in decoder location conv layer
attention_kernel_size=31, # kernel size of decoder location conv layer attention_kernel_size=31, # kernel size of decoder location conv layer
@ -48,8 +50,10 @@ _C.model = CN(
postnet_conv_layers=5, # number of conv layer in decoder postnet postnet_conv_layers=5, # number of conv layer in decoder postnet
p_encoder_dropout=0.5, # droput probability in encoder p_encoder_dropout=0.5, # droput probability in encoder
p_prenet_dropout=0.5, # droput probability in decoder prenet p_prenet_dropout=0.5, # droput probability in decoder prenet
p_attention_dropout=0.1, # droput probability of first rnn layer in decoder p_attention_dropout=
p_decoder_dropout=0.1, # droput probability of second rnn layer in decoder 0.1, # droput probability of first rnn layer in decoder
p_decoder_dropout=
0.1, # droput probability of second rnn layer in decoder
p_postnet_dropout=0.5, # droput probability in decoder postnet p_postnet_dropout=0.5, # droput probability in decoder postnet
guided_attention_loss_sigma=0.2, guided_attention_loss_sigma=0.2,
d_global_condition=256, d_global_condition=256,
@ -71,5 +75,3 @@ def get_cfg_defaults():
# Return a clone so that the defaults will not be altered # Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern # This is for the "local variable" use pattern
return _C.clone() return _C.clone()

View File

@ -1,17 +1,20 @@
import argparse
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
from parakeet.audio import AudioProcessor from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import LogMagnitude from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
import multiprocessing as mp import multiprocessing as mp
from functools import partial from functools import partial
import tqdm import tqdm
from yacs.config import CfgNode
def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n): from config import get_cfg_defaults
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
p: AudioProcessor, n: NormalizerBase):
relative_path = fname.relative_to(input_dir) relative_path = fname.relative_to(input_dir)
out_path = (output_dir / relative_path).with_suffix(".npy") out_path = (output_dir / relative_path).with_suffix(".npy")
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
# TODO: maybe we need to rescale the audio
wav = p.read_wav(fname) wav = p.read_wav(fname)
mel = p.mel_spectrogram(wav) mel = p.mel_spectrogram(wav)
mel = n.transform(mel) mel = n.transform(mel)
@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length, p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
config.hop_length, config.n_mels, config.fmin, config.hop_length, config.n_mels, config.fmin,
config.fmax) config.fmax)
n = LogMagnitude(1e-5) n = LogMagnitude(1e-5)
func = partial(extract_mel, func = partial(extract_mel,
input_dir=input_dir, input_dir=input_dir,
output_dir=output_dir, output_dir=output_dir,
p=p, p=p,
n=n) n=n)
with mp.Pool(16) as pool: with mp.Pool(16) as pool:
list( list(
tqdm.tqdm(pool.imap(func, fnames), tqdm.tqdm(pool.imap(func, fnames),
total=len(fnames), total=len(fnames),
unit="utterance")) unit="utterance"))
if __name__ == "__main__": if __name__ == "__main__":
audio_config = { parser = argparse.ArgumentParser(
"sample_rate": 22050, description=
"n_fft": 1024, "Extract mel spectrogram from processed wav in AiShell3 training dataset."
"win_length": 1024, )
"hop_length": 256, parser.add_argument(
"n_mels": 80, "--config",
"fmin": 0, type=str,
"fmax": 8000} help="yaml config file to overwrite the default config")
audio_config = CfgNode(audio_config) parser.add_argument("--input",
extract_mel_multispeaker(audio_config, "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/mel") type=str,
default="~/datasets/aishell3/train/normalized_wav",
help="path of the processed wav folder")
parser.add_argument("--output",
type=str,
default="~/datasets/aishell3/train/mel",
help="path of the folder to save mel spectrograms")
parser.add_argument(
"--opts",
nargs=argparse.REMAINDER,
help=
"options to overwrite --config file and the default config, passing in KEY VALUE pairs"
)
default_config = get_cfg_defaults()
args = parser.parse_args()
if args.config:
default_config.merge_from_file(args.config)
if args.opts:
default_config.merge_from_list(args.opts)
default_config.freeze()
audio_config = default_config.data
extract_mel_multispeaker(audio_config, args.input, args.output)

View File

@ -1,13 +1,10 @@
from paddle.io import Dataset import argparse
from pathlib import Path from pathlib import Path
import re import re
import pickle import pickle
import yaml import yaml
import tqdm import tqdm
from parakeet.audio import AudioProcessor, LogMagnitude
import numpy as np
import multiprocessing as mp
from functools import partial
zh_pattern = re.compile("[\u4e00-\u9fa5]") zh_pattern = re.compile("[\u4e00-\u9fa5]")
@ -16,21 +13,71 @@ _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
_pauses = {'%', '$'} _pauses = {'%', '$'}
_initials = { _initials = {
'b', 'p', 'm', 'f', 'b',
'd', 't', 'n', 'l', 'p',
'g', 'k', 'h', 'm',
'j', 'q', 'x', 'f',
'zh', 'ch', 'sh', 'd',
't',
'n',
'l',
'g',
'k',
'h',
'j',
'q',
'x',
'zh',
'ch',
'sh',
'r', 'r',
'z', 'c', 's', 'z',
'c',
's',
} }
_finals = { _finals = {
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er', 'ii',
'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng', 'iii',
'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng', 'a',
'v', 've', 'van', 'ven', 'veng', 'o',
} 'e',
'ea',
'ai',
'ei',
'ao',
'ou',
'an',
'en',
'ang',
'eng',
'er',
'i',
'ia',
'io',
'ie',
'iai',
'iao',
'iou',
'ian',
'ien',
'iang',
'ieng',
'u',
'ua',
'uo',
'uai',
'uei',
'uan',
'uen',
'uang',
'ueng',
'v',
've',
'van',
'ven',
'veng',
}
_ernized_symbol = {'&r'} _ernized_symbol = {'&r'}
@ -38,6 +85,7 @@ _specials = {'<pad>', '<unk>', '<s>', '</s>'}
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses _phones = _initials | _finals | _ernized_symbol | _specials | _pauses
def is_zh(word): def is_zh(word):
global zh_pattern global zh_pattern
match = zh_pattern.search(word) match = zh_pattern.search(word)
@ -47,6 +95,7 @@ def is_zh(word):
def ernized(syllable): def ernized(syllable):
return syllable[:2] != "er" and syllable[-2] == 'r' return syllable[:2] != "er" and syllable[-2] == 'r'
def convert(syllable): def convert(syllable):
# expansion of o -> uo # expansion of o -> uo
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable) syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
@ -56,15 +105,17 @@ def convert(syllable):
# expansion for ing, in # expansion for ing, in
syllable = syllable.replace("ing", "ieng").replace("in", "ien") syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu # expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou") syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
# rule for variants of i # rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\ syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\ .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
.replace("ri", "riii") .replace("ri", "riii")
# rule for y preceding i, u # rule for y preceding i, u
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i") syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
@ -72,11 +123,21 @@ def convert(syllable):
syllable = syllable.replace("wu", "u").replace("w", "u") syllable = syllable.replace("wu", "u").replace("w", "u")
# rule for v following j, q, x # rule for v following j, q, x
syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv") syllable = syllable.replace("ju", "jv").replace("qu",
"qv").replace("xu", "xv")
return syllable return syllable
def split_syllable(syllable:str):
def split_syllable(syllable: str):
"""Split a syllable in pinyin into a list of phones and a list of tones.
Initials have no tone, represented by '0', while finals have tones from
'1,2,3,4,5'.
e.g.
zhang -> ['zh', 'ang'], ['0', '1']
"""
if syllable in _pauses: if syllable in _pauses:
# syllable, tone # syllable, tone
return [syllable], ['0'] return [syllable], ['0']
@ -104,7 +165,7 @@ def split_syllable(syllable:str):
return phones, tones return phones, tones
def load_aishell3_transcription(line:str): def load_aishell3_transcription(line: str):
sentence_id, pinyin, text = line.strip().split("|") sentence_id, pinyin, text = line.strip().split("|")
syllables = pinyin.strip().split() syllables = pinyin.strip().split()
@ -118,7 +179,7 @@ def load_aishell3_transcription(line:str):
else: else:
results.append(syllable[:-2] + syllable[-1]) results.append(syllable[:-2] + syllable[-1])
results.append('&r5') results.append('&r5')
phones = [] phones = []
tones = [] tones = []
for syllable in results: for syllable in results:
@ -127,7 +188,13 @@ def load_aishell3_transcription(line:str):
tones.extend(t) tones.extend(t)
for p in phones: for p in phones:
assert p in _phones, p assert p in _phones, p
return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones} return {
"sentence_id": sentence_id,
"text": text,
"syllables": results,
"phones": phones,
"tones": tones
}
def process_aishell3(dataset_root, output_dir): def process_aishell3(dataset_root, output_dir):
@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
prosody_label_path = dataset_root / "label_train-set.txt" prosody_label_path = dataset_root / "label_train-set.txt"
with open(prosody_label_path, 'rt') as f: with open(prosody_label_path, 'rt') as f:
lines = [line.strip() for line in f] lines = [line.strip() for line in f]
records = lines[5:] records = lines[5:]
processed_records = [] processed_records = []
for record in tqdm.tqdm(records): for record in tqdm.tqdm(records):
new_record = load_aishell3_transcription(record) new_record = load_aishell3_transcription(record)
processed_records.append(new_record) processed_records.append(new_record)
print(new_record) print(new_record)
with open(output_dir / "metadata.pickle", 'wb') as f: with open(output_dir / "metadata.pickle", 'wb') as f:
pickle.dump(processed_records, f) pickle.dump(processed_records, f)
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
print("metadata done!")
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records,
f,
default_flow_style=None,
allow_unicode=True)
print("metadata done!")
if __name__ == "__main__": if __name__ == "__main__":
process_aishell3("~/datasets/aishell3/train", "~/datasets/aishell3/train") parser = argparse.ArgumentParser(
description=
"Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
)
parser.add_argument(
"--input",
type=str,
default="~/datasets/aishell3/train",
help="path of the training dataset,(contains a label_train-set.txt).")
parser.add_argument(
"--output",
type=str,
help="the directory to save the processed transcription."
"If not provided, it would be the same as the input.")
args = parser.parse_args()
if args.output is None:
args.output = args.input
process_aishell3(args.input, args.output)

View File

@ -1,15 +1,17 @@
import librosa
import soundfile as sf
from pathlib import Path from pathlib import Path
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from functools import partial from functools import partial
import numpy as np import numpy as np
import librosa
import soundfile as sf
from tqdm import tqdm
from praatio import tgio from praatio import tgio
def get_valid_part(fpath): def get_valid_part(fpath):
f = tgio.openTextgrid(fpath) f = tgio.openTextgrid(fpath)
start = 0 start = 0
phone_entry_list = f.tierDict['phones'].entryList phone_entry_list = f.tierDict['phones'].entryList
first_entry = phone_entry_list[0] first_entry = phone_entry_list[0]
@ -22,7 +24,7 @@ def get_valid_part(fpath):
else: else:
end = last_entry.end end = last_entry.end
return start, end return start, end
def process_utterance(fpath, source_dir, target_dir, alignment_dir): def process_utterance(fpath, source_dir, target_dir, alignment_dir):
rel_path = fpath.relative_to(source_dir) rel_path = fpath.relative_to(source_dir)
@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
source_dir = Path(source_dir).expanduser() source_dir = Path(source_dir).expanduser()
target_dir = Path(target_dir).expanduser() target_dir = Path(target_dir).expanduser()
alignment_dir = Path(alignment_dir).expanduser() alignment_dir = Path(alignment_dir).expanduser()
wav_paths = list(source_dir.rglob("*.wav")) wav_paths = list(source_dir.rglob("*.wav"))
print(f"there are {len(wav_paths)} audio files in total") print(f"there are {len(wav_paths)} audio files in total")
fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir) fx = partial(process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
with Pool(16) as p: with Pool(16) as p:
list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance")) list(
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
unit="utterance"))
if __name__ == "__main__": if __name__ == "__main__":
preprocess_aishell3("~/datasets/aishell3/train/wav", "~/datasets/aishell3/train/normalized_wav", "~/datasets/aishell3/train/alignment") preprocess_aishell3("~/datasets/aishell3/train/wav",
"~/datasets/aishell3/train/normalized_wav",
"~/datasets/aishell3/train/alignment")

View File

@ -12,19 +12,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import time
import sys import sys
from collections import defaultdict
import logging import logging
from pathlib import Path from pathlib import Path
import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
from paddle.io import DistributedBatchSampler from paddle.io import DistributedBatchSampler
from visualdl import LogWriter from visualdl import LogWriter
import parakeet
from parakeet.utils import checkpoint, mp_tools from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"] __all__ = ["ExperimentBase"]