fix speaker encoder and add support for 2 more datasets

This commit is contained in:
chenfeiyu 2021-03-30 14:38:44 +08:00
parent 59ed247840
commit b5dd0cc197
3 changed files with 30 additions and 4 deletions

View File

@ -137,3 +137,26 @@ def process_voxceleb2(processor,
speaker_dirs = list((dataset_root / "wav").glob("*")) speaker_dirs = list((dataset_root / "wav").glob("*"))
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name, _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.wav", skip_existing) output_dir, "*.wav", skip_existing)
def process_aidatatang_200zh(processor,
                             datasets_root,
                             output_dir,
                             skip_existing=False):
    """Preprocess the aidatatang_200zh train split for speaker-encoder training.

    Args:
        processor: audio preprocessor applied to each utterance
            (presumably a SpeakerVerificationPreprocessor — confirm at caller).
        datasets_root: Path to the directory that contains all datasets.
        output_dir: Path where processed features are written.
        skip_existing (bool): if True, utterances already processed are skipped.
    """
    dataset_name = "aidatatang_200zh/train"
    dataset_root = datasets_root / dataset_name
    # Keep only directories: each speaker has its own folder, but a bare
    # glob("*") would also pick up any stray metadata files sitting next to
    # the speaker folders and pass them along as fake "speakers".
    speaker_dirs = [path for path in dataset_root.glob("*") if path.is_dir()]
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)
def process_magicdata(processor,
                      datasets_root,
                      output_dir,
                      skip_existing=False):
    """Preprocess the MAGICDATA train split for speaker-encoder training.

    Args:
        processor: audio preprocessor applied to each utterance
            (presumably a SpeakerVerificationPreprocessor — confirm at caller).
        datasets_root: Path to the directory that contains all datasets.
        output_dir: Path where processed features are written.
        skip_existing (bool): if True, utterances already processed are skipped.
    """
    dataset_name = "magicdata/train"
    dataset_root = datasets_root / dataset_name
    # Keep only directories: the MAGICDATA train folder ships a transcript
    # file (e.g. TRANS.txt) next to the speaker folders — NOTE(review):
    # a bare glob("*") would hand that file to _process_dataset as a speaker.
    speaker_dirs = [path for path in dataset_root.glob("*") if path.is_dir()]
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)

View File

@ -2,7 +2,7 @@ import argparse
from pathlib import Path from pathlib import Path
from config import get_cfg_defaults from config import get_cfg_defaults
from audio_processor import SpeakerVerificationPreprocessor from audio_processor import SpeakerVerificationPreprocessor
from dataset_processors import process_librispeech, process_voxceleb1, process_voxceleb2 from dataset_processors import process_librispeech, process_voxceleb1, process_voxceleb2, process_aidatatang_200zh, process_magicdata
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
@ -23,7 +23,7 @@ if __name__ == "__main__":
help= help=
"comma-separated list of names of the datasets you want to preprocess. only " "comma-separated list of names of the datasets you want to preprocess. only "
"the train set of these datastes will be used. Possible names: librispeech_other, " "the train set of these datastes will be used. Possible names: librispeech_other, "
"voxceleb1, voxceleb2.") "voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
parser.add_argument( parser.add_argument(
"--skip_existing", "--skip_existing",
action="store_true", action="store_true",
@ -79,6 +79,8 @@ if __name__ == "__main__":
"librispeech_other": process_librispeech, "librispeech_other": process_librispeech,
"voxceleb1": process_voxceleb1, "voxceleb1": process_voxceleb1,
"voxceleb2": process_voxceleb2, "voxceleb2": process_voxceleb2,
"aidatatang_200zh": process_aidatatang_200zh,
"magicdata": process_magicdata,
} }
for dataset in args.datasets: for dataset in args.datasets:

View File

@ -32,8 +32,9 @@ class LSTMSpeakerEncoder(nn.Layer):
normalized_embeds = F.normalize(embeds) normalized_embeds = F.normalize(embeds)
if reduce: if reduce:
embed = paddle.mean(normalized_embeds, 0) embed = paddle.mean(normalized_embeds, 0)
embed = F.normalize(embed, axis=0) embed = F.normalize(embed, axis=0)
return embed return embed
return normalized_embeds
def embed_utterance(self, utterances, initial_states=None): def embed_utterance(self, utterances, initial_states=None):
# utterances: [B, T, C] -> embed [C'] # utterances: [B, T, C] -> embed [C']