fix speaker encoder and add support for 2 more datasets
This commit is contained in:
parent
59ed247840
commit
b5dd0cc197
|
@ -137,3 +137,26 @@ def process_voxceleb2(processor,
|
||||||
speaker_dirs = list((dataset_root / "wav").glob("*"))
|
speaker_dirs = list((dataset_root / "wav").glob("*"))
|
||||||
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
|
||||||
output_dir, "*.wav", skip_existing)
|
output_dir, "*.wav", skip_existing)
|
||||||
|
|
||||||
|
def process_aidatatang_200zh(processor,
                             datasets_root,
                             output_dir,
                             skip_existing=False):
    """Preprocess the aidatatang_200zh train split for the speaker encoder.

    Each immediate subdirectory of ``<datasets_root>/aidatatang_200zh/train``
    is treated as one speaker; its ``*.wav`` utterances are forwarded to
    ``_process_dataset``. With ``skip_existing=True``, outputs that already
    exist are left untouched.
    """
    dataset_name = "aidatatang_200zh/train"
    # One entry per speaker: the dataset root's direct children.
    speakers = list((datasets_root / dataset_name).glob("*"))
    _process_dataset(processor, datasets_root, speakers, dataset_name,
                     output_dir, "*.wav", skip_existing)
|
||||||
|
|
||||||
|
|
||||||
|
def process_magicdata(processor,
                      datasets_root,
                      output_dir,
                      skip_existing=False):
    """Preprocess the MAGICDATA train split for the speaker encoder.

    Each immediate subdirectory of ``<datasets_root>/magicdata/train`` is
    treated as one speaker; its ``*.wav`` utterances are forwarded to
    ``_process_dataset``. With ``skip_existing=True``, outputs that already
    exist are left untouched.
    """
    dataset_name = "magicdata/train"
    # One entry per speaker: the dataset root's direct children.
    speakers = list((datasets_root / dataset_name).glob("*"))
    _process_dataset(processor, datasets_root, speakers, dataset_name,
                     output_dir, "*.wav", skip_existing)
|
||||||
|
|
|
@ -2,7 +2,7 @@ import argparse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from config import get_cfg_defaults
|
from config import get_cfg_defaults
|
||||||
from audio_processor import SpeakerVerificationPreprocessor
|
from audio_processor import SpeakerVerificationPreprocessor
|
||||||
from dataset_processors import process_librispeech, process_voxceleb1, process_voxceleb2
|
from dataset_processors import process_librispeech, process_voxceleb1, process_voxceleb2, process_aidatatang_200zh, process_magicdata
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
|
@ -23,7 +23,7 @@ if __name__ == "__main__":
|
||||||
help=
|
help=
|
||||||
"comma-separated list of names of the datasets you want to preprocess. only "
|
"comma-separated list of names of the datasets you want to preprocess. only "
|
||||||
"the train set of these datastes will be used. Possible names: librispeech_other, "
|
"the train set of these datastes will be used. Possible names: librispeech_other, "
|
||||||
"voxceleb1, voxceleb2.")
|
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip_existing",
|
"--skip_existing",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
|
@ -79,6 +79,8 @@ if __name__ == "__main__":
|
||||||
"librispeech_other": process_librispeech,
|
"librispeech_other": process_librispeech,
|
||||||
"voxceleb1": process_voxceleb1,
|
"voxceleb1": process_voxceleb1,
|
||||||
"voxceleb2": process_voxceleb2,
|
"voxceleb2": process_voxceleb2,
|
||||||
|
"aidatatang_200zh": process_aidatatang_200zh,
|
||||||
|
"magicdata": process_magicdata,
|
||||||
}
|
}
|
||||||
|
|
||||||
for dataset in args.datasets:
|
for dataset in args.datasets:
|
||||||
|
|
|
@ -32,8 +32,9 @@ class LSTMSpeakerEncoder(nn.Layer):
|
||||||
normalized_embeds = F.normalize(embeds)
|
normalized_embeds = F.normalize(embeds)
|
||||||
if reduce:
|
if reduce:
|
||||||
embed = paddle.mean(normalized_embeds, 0)
|
embed = paddle.mean(normalized_embeds, 0)
|
||||||
embed = F.normalize(embed, axis=0)
|
embed = F.normalize(embed, axis=0)
|
||||||
return embed
|
return embed
|
||||||
|
return normalized_embeds
|
||||||
|
|
||||||
def embed_utterance(self, utterances, initial_states=None):
|
def embed_utterance(self, utterances, initial_states=None):
|
||||||
# utterances: [B, T, C] -> embed [C']
|
# utterances: [B, T, C] -> embed [C']
|
||||||
|
|
Loading…
Reference in New Issue