clean code for data processing

This commit is contained in:
iclementine 2021-04-22 17:20:34 +08:00
parent 56f2552201
commit e8a9a118bb
8 changed files with 226 additions and 104 deletions

View File

@ -1,49 +1,55 @@
import pickle
from pathlib import Path
import numpy as np
import librosa
from paddle.io import Dataset
import pickle
from parakeet.frontend import Vocab
from parakeet.data import batch_text_id, batch_spec
from preprocess_transcription import _phones, _tones
# use yaml to store preprocessed aishell3 dataset
# NOTE(review): the dataset class in this file reads "metadata.pickle", not
# yaml -- confirm whether the comment above is still accurate.

# Module-level vocabularies built once at import time. The symbol sets are
# sorted first so that the order handed to Vocab is deterministic across runs
# (set iteration order is not).
voc_phones = Vocab(sorted(_phones))
print("vocab_phones:\n", voc_phones)

voc_tones = Vocab(sorted(_tones))
print("vocab_tones:\n", voc_tones)
class AiShell3(Dataset):
    """Processed AiShell3 dataset.

    Expects ``root`` to contain a ``metadata.pickle`` file plus ``mel`` and
    ``embed`` folders, each laid out as ``<speaker_id>/<sentence_id>.npy``.
    """

    def __init__(self, root):
        """Load the pickled metadata records found under ``root``.

        Args:
            root: Path of the processed dataset folder; ``~`` is expanded.
        """
        super().__init__()
        self.root = Path(root).expanduser()
        self.embed_dir = self.root / "embed"
        self.mel_dir = self.root / "mel"

        # NOTE: pickle.load can execute arbitrary code while loading; only
        # point this at metadata produced by a trusted preprocessing run.
        with open(self.root / "metadata.pickle", 'rb') as f:
            self.records = pickle.load(f)

    def __getitem__(self, index):
        """Return ``(phones, tones, mel, embed)`` for the record at ``index``.

        ``phones`` and ``tones`` are int64 arrays of vocabulary ids looked up
        in ``voc_phones`` and ``voc_tones``; ``mel`` and ``embed`` are the
        arrays stored in the matching ``.npy`` files.
        """
        metadatum = self.records[index]
        sentence_id = metadatum["sentence_id"]
        # The first 7 characters of the sentence id name the speaker folder.
        speaker_id = sentence_id[:7]
        phones = metadatum["phones"]
        tones = metadatum["tones"]
        phones = np.array([voc_phones.lookup(item) for item in phones],
                          dtype=np.int64)
        tones = np.array([voc_tones.lookup(item) for item in tones],
                         dtype=np.int64)
        mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
        embed = np.load(
            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
        return phones, tones, mel, embed

    def __len__(self):
        """Return the number of metadata records."""
        return len(self.records)
def collate_aishell3_examples(examples):
phones, tones, mel, embed = list(zip(*examples))
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
T_dec = np.max(spec_lengths)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
stop_tokens = (np.arange(T_dec) >= np.expand_dims(spec_lengths,
-1)).astype(np.float32)
phones, _ = batch_text_id(phones)
tones, _ = batch_text_id(tones)
mel, _ = batch_spec(mel)
@ -53,13 +59,13 @@ def collate_aishell3_examples(examples):
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
if __name__ == "__main__":
dataset = AiShell3("~/datasets/aishell3/train")
example = dataset[0]
examples = [dataset[i] for i in range(10)]
batch = collate_aishell3_examples(examples)
for field in batch:
print(field.shape, field.dtype)

View File

@ -2,9 +2,9 @@ from chinese_text_to_pinyin import convert_to_pinyin
from chinese_phonology import convert, split_syllable
from typing import List, Tuple
def convert_sentence(text: str) -> List[Tuple[str]]:
syllables = convert_to_pinyin(text)
syllables = [item[0] for item in syllables]
phones = []
tones = []
for syllable in syllables:

View File

@ -1,4 +1,4 @@
from pypinyin import pinyin, Style
from pypinyin import lazy_pinyin, Style
from typing import List
@ -6,6 +6,7 @@ def convert_to_pinyin(text: str) -> List[str]:
"""convert text into list of syllables, other characters that are not chinese, thus
cannot be converted to pinyin are split out.
"""
syllables = pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
syllables = lazy_pinyin(text,
style=Style.TONE3,
neutral_tone_with_five=True)
return syllables

View File

@ -23,8 +23,8 @@ _C.data = CN(
n_fft=1024, # fft frame size
win_length=1024, # window size
hop_length=256, # hop size between adjacent frames
f_max=8000, # Hz, max frequency when converting to mel
f_min=0, # Hz, min frequency when converting to mel
fmax=8000, # Hz, max frequency when converting to mel
fmin=0, # Hz, min frequency when converting to mel
d_mels=80, # mel bands
padding_idx=0, # text embedding's padding index
))
@ -38,8 +38,10 @@ _C.model = CN(
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
d_prenet=256, # hidden size of decoder prenet
d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention_rnn=
1024, # hidden size of the first rnn layer in tacotron2 decoder
d_decoder_rnn=
1024, # hidden size of the second rnn layer in tacotron2 decoder
d_attention=128, # hidden size of decoder location linear layer
attention_filters=32, # number of filter in decoder location conv layer
attention_kernel_size=31, # kernel size of decoder location conv layer
@ -48,8 +50,10 @@ _C.model = CN(
postnet_conv_layers=5, # number of conv layer in decoder postnet
p_encoder_dropout=0.5, # dropout probability in encoder
p_prenet_dropout=0.5, # dropout probability in decoder prenet
p_attention_dropout=0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=0.1, # dropout probability of second rnn layer in decoder
p_attention_dropout=
0.1, # dropout probability of first rnn layer in decoder
p_decoder_dropout=
0.1, # dropout probability of second rnn layer in decoder
p_postnet_dropout=0.5, # dropout probability in decoder postnet
guided_attention_loss_sigma=0.2,
d_global_condition=256,
@ -71,5 +75,3 @@ def get_cfg_defaults():
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
return _C.clone()

View File

@ -1,17 +1,20 @@
import argparse
import numpy as np
from pathlib import Path
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import LogMagnitude
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
import multiprocessing as mp
from functools import partial
import tqdm
from yacs.config import CfgNode
def extract_mel(fname:Path, input_dir:Path, output_dir:Path, p, n):
from config import get_cfg_defaults
def extract_mel(fname: Path, input_dir: Path, output_dir: Path,
p: AudioProcessor, n: NormalizerBase):
relative_path = fname.relative_to(input_dir)
out_path = (output_dir / relative_path).with_suffix(".npy")
out_path.parent.mkdir(parents=True, exist_ok=True)
# TODO: maybe we need to rescale the audio
wav = p.read_wav(fname)
mel = p.mel_spectrogram(wav)
mel = n.transform(mel)
@ -25,33 +28,54 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
output_dir.mkdir(parents=True, exist_ok=True)
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
config.hop_length, config.n_mels, config.fmin,
config.fmax)
config.hop_length, config.n_mels, config.fmin,
config.fmax)
n = LogMagnitude(1e-5)
func = partial(extract_mel,
input_dir=input_dir,
output_dir=output_dir,
p=p,
n=n)
input_dir=input_dir,
output_dir=output_dir,
p=p,
n=n)
with mp.Pool(16) as pool:
list(
tqdm.tqdm(pool.imap(func, fnames),
total=len(fnames),
unit="utterance"))
total=len(fnames),
unit="utterance"))
if __name__ == "__main__":
    # Command-line entry point: build the audio config from the defaults,
    # optionally overridden by a yaml file and/or KEY VALUE pairs, then
    # extract mel spectrograms for every wav under --input into --output.
    parser = argparse.ArgumentParser(
        description=
        "Extract mel spectrogram from processed wav in AiShell3 training dataset."
    )
    parser.add_argument(
        "--config",
        type=str,
        help="yaml config file to overwrite the default config")
    parser.add_argument("--input",
                        type=str,
                        default="~/datasets/aishell3/train/normalized_wav",
                        help="path of the processed wav folder")
    parser.add_argument("--output",
                        type=str,
                        default="~/datasets/aishell3/train/mel",
                        help="path of the folder to save mel spectrograms")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help=
        "options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    default_config = get_cfg_defaults()

    args = parser.parse_args()
    # Precedence (lowest to highest): defaults, --config file, --opts pairs.
    if args.config:
        default_config.merge_from_file(args.config)
    if args.opts:
        default_config.merge_from_list(args.opts)
    default_config.freeze()
    audio_config = default_config.data
    # NOTE(review): extract_mel_multispeaker reads `config.n_mels`, while the
    # visible part of the default data config defines `d_mels` -- confirm that
    # `n_mels` exists in the data section, otherwise this call will fail.

    extract_mel_multispeaker(audio_config, args.input, args.output)

View File

@ -1,13 +1,10 @@
from paddle.io import Dataset
import argparse
from pathlib import Path
import re
import pickle
import yaml
import tqdm
from parakeet.audio import AudioProcessor, LogMagnitude
import numpy as np
import multiprocessing as mp
from functools import partial
zh_pattern = re.compile("[\u4e00-\u9fa5]")
@ -16,21 +13,71 @@ _tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
_pauses = {'%', '$'}
_initials = {
'b', 'p', 'm', 'f',
'd', 't', 'n', 'l',
'g', 'k', 'h',
'j', 'q', 'x',
'zh', 'ch', 'sh',
'b',
'p',
'm',
'f',
'd',
't',
'n',
'l',
'g',
'k',
'h',
'j',
'q',
'x',
'zh',
'ch',
'sh',
'r',
'z', 'c', 's',
'z',
'c',
's',
}
_finals = {
'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'er',
'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', 'iang', 'ieng',
'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng',
'v', 've', 'van', 'ven', 'veng',
}
'ii',
'iii',
'a',
'o',
'e',
'ea',
'ai',
'ei',
'ao',
'ou',
'an',
'en',
'ang',
'eng',
'er',
'i',
'ia',
'io',
'ie',
'iai',
'iao',
'iou',
'ian',
'ien',
'iang',
'ieng',
'u',
'ua',
'uo',
'uai',
'uei',
'uan',
'uen',
'uang',
'ueng',
'v',
've',
'van',
'ven',
'veng',
}
_ernized_symbol = {'&r'}
@ -38,6 +85,7 @@ _specials = {'<pad>', '<unk>', '<s>', '</s>'}
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
def is_zh(word):
global zh_pattern
match = zh_pattern.search(word)
@ -47,6 +95,7 @@ def is_zh(word):
def ernized(syllable):
    """Tell whether ``syllable`` has an 'r' as its second-to-last character.

    Syllables that start with "er" are never reported, so a plain "er"
    syllable is not mistaken for an r-suffixed one. The last character is
    presumably the tone digit (e.g. "huar1") -- confirm against the caller.

    Args:
        syllable: A syllable string; may be shorter than two characters.

    Returns:
        True if the syllable does not start with "er" and its second-to-last
        character is 'r'; False otherwise, including for inputs shorter than
        two characters.
    """
    # Slice instead of index: `syllable[-2]` raises IndexError on inputs
    # shorter than two characters (e.g. the pause marks '%' and '$'), while
    # the slice simply yields "" there.
    return syllable[:2] != "er" and syllable[-2:-1] == 'r'
def convert(syllable):
# expansion of o -> uo
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
@ -56,15 +105,17 @@ def convert(syllable):
# expansion for ing, in
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
# expansion for un, ui, iu
syllable = syllable.replace("un", "uen").replace("ui", "uei").replace("iu", "iou")
syllable = syllable.replace("un",
"uen").replace("ui",
"uei").replace("iu", "iou")
# rule for variants of i
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
.replace("ri", "riii")
# rule for y preceding i, u
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
@ -72,11 +123,21 @@ def convert(syllable):
syllable = syllable.replace("wu", "u").replace("w", "u")
# rule for v following j, q, x
syllable = syllable.replace("ju", "jv").replace("qu", "qv").replace("xu", "xv")
syllable = syllable.replace("ju", "jv").replace("qu",
"qv").replace("xu", "xv")
return syllable
def split_syllable(syllable:str):
def split_syllable(syllable: str):
"""Split a syllable in pinyin into a list of phones and a list of tones.
Initials have no tone, represented by '0', while finals have tones from
'1,2,3,4,5'.
e.g.
zhang -> ['zh', 'ang'], ['0', '1']
"""
if syllable in _pauses:
# syllable, tone
return [syllable], ['0']
@ -104,7 +165,7 @@ def split_syllable(syllable:str):
return phones, tones
def load_aishell3_transcription(line:str):
def load_aishell3_transcription(line: str):
sentence_id, pinyin, text = line.strip().split("|")
syllables = pinyin.strip().split()
@ -118,7 +179,7 @@ def load_aishell3_transcription(line:str):
else:
results.append(syllable[:-2] + syllable[-1])
results.append('&r5')
phones = []
tones = []
for syllable in results:
@ -127,7 +188,13 @@ def load_aishell3_transcription(line:str):
tones.extend(t)
for p in phones:
assert p in _phones, p
return {"sentence_id": sentence_id, "text": text, "syllables": results, "phones": phones, "tones": tones}
return {
"sentence_id": sentence_id,
"text": text,
"syllables": results,
"phones": phones,
"tones": tones
}
def process_aishell3(dataset_root, output_dir):
@ -136,26 +203,46 @@ def process_aishell3(dataset_root, output_dir):
output_dir.mkdir(parents=True, exist_ok=True)
prosody_label_path = dataset_root / "label_train-set.txt"
with open(prosody_label_path, 'rt') as f:
with open(prosody_label_path, 'rt') as f:
lines = [line.strip() for line in f]
records = lines[5:]
processed_records = []
for record in tqdm.tqdm(records):
new_record = load_aishell3_transcription(record)
processed_records.append(new_record)
print(new_record)
with open(output_dir / "metadata.pickle", 'wb') as f:
with open(output_dir / "metadata.pickle", 'wb') as f:
pickle.dump(processed_records, f)
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records, f, default_flow_style=None, allow_unicode=True)
print("metadata done!")
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
yaml.safe_dump(processed_records,
f,
default_flow_style=None,
allow_unicode=True)
print("metadata done!")
if __name__ == "__main__":
    # Command-line entry point: read the transcription labels found under
    # --input and write the processed metadata files into --output.
    parser = argparse.ArgumentParser(
        description=
        "Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train",
        help="path of the training dataset,(contains a label_train-set.txt).")
    parser.add_argument(
        "--output",
        type=str,
        # The two literals are concatenated, so the separating space must be
        # part of one of them.
        help="the directory to save the processed transcription. "
        "If not provided, it would be the same as the input.")
    args = parser.parse_args()
    # Default the output folder to the input folder when it is not given.
    if args.output is None:
        args.output = args.input

    process_aishell3(args.input, args.output)

View File

@ -1,15 +1,17 @@
import librosa
import soundfile as sf
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm
from functools import partial
import numpy as np
import librosa
import soundfile as sf
from tqdm import tqdm
from praatio import tgio
def get_valid_part(fpath):
f = tgio.openTextgrid(fpath)
start = 0
phone_entry_list = f.tierDict['phones'].entryList
first_entry = phone_entry_list[0]
@ -22,7 +24,7 @@ def get_valid_part(fpath):
else:
end = last_entry.end
return start, end
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
rel_path = fpath.relative_to(source_dir)
@ -41,16 +43,20 @@ def preprocess_aishell3(source_dir, target_dir, alignment_dir):
source_dir = Path(source_dir).expanduser()
target_dir = Path(target_dir).expanduser()
alignment_dir = Path(alignment_dir).expanduser()
wav_paths = list(source_dir.rglob("*.wav"))
print(f"there are {len(wav_paths)} audio files in total")
fx = partial(process_utterance, source_dir=source_dir, target_dir=target_dir, alignment_dir=alignment_dir)
fx = partial(process_utterance,
source_dir=source_dir,
target_dir=target_dir,
alignment_dir=alignment_dir)
with Pool(16) as p:
list(tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
list(
tqdm(p.imap(fx, wav_paths), total=len(wav_paths),
unit="utterance"))
if __name__ == "__main__":
    # Run the preprocessing once over the default dataset layout; the three
    # arguments are the source wav folder, the target folder for the
    # processed wavs, and the alignment folder.
    preprocess_aishell3("~/datasets/aishell3/train/wav",
                        "~/datasets/aishell3/train/normalized_wav",
                        "~/datasets/aishell3/train/alignment")

View File

@ -12,19 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import sys
from collections import defaultdict
import logging
from pathlib import Path
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DistributedBatchSampler
from visualdl import LogWriter
import parakeet
from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"]