102 lines
4.0 KiB
Python
102 lines
4.0 KiB
Python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
from config import get_cfg_defaults
|
|
from audio_processor import SpeakerVerificationPreprocessor
|
|
from dataset_processors import (process_librispeech, process_voxceleb1,
|
|
process_voxceleb2, process_aidatatang_200zh,
|
|
process_magicdata)
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(
|
|
description="preprocess dataset for speaker verification task")
|
|
parser.add_argument(
|
|
"--datasets_root",
|
|
type=Path,
|
|
help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
|
|
)
|
|
parser.add_argument(
|
|
"--output_dir", type=Path, help="Path to save processed dataset.")
|
|
parser.add_argument(
|
|
"--dataset_names",
|
|
type=str,
|
|
default="librispeech_other,voxceleb1,voxceleb2",
|
|
help="comma-separated list of names of the datasets you want to preprocess. only "
|
|
"the train set of these datastes will be used. Possible names: librispeech_other, "
|
|
"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
|
|
parser.add_argument(
|
|
"--skip_existing",
|
|
action="store_true",
|
|
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
|
|
)
|
|
parser.add_argument(
|
|
"--no_trim",
|
|
action="store_true",
|
|
help="Preprocess audio without trimming silences (not recommended).")
|
|
|
|
args = parser.parse_args()
|
|
|
|
if not args.no_trim:
|
|
try:
|
|
import webrtcvad
|
|
print(webrtcvad.__version__)
|
|
except Exception as e:
|
|
raise ModuleNotFoundError(
|
|
"Package 'webrtcvad' not found. This package enables "
|
|
"noise removal and is recommended. Please install and "
|
|
"try again. If installation fails, "
|
|
"use --no_trim to disable this error message.")
|
|
del args.no_trim
|
|
|
|
args.datasets = [item.strip() for item in args.dataset_names.split(",")]
|
|
if not hasattr(args, "output_dir"):
|
|
args.output_dir = args.dataset_root / "SV2TTS" / "encoder"
|
|
|
|
args.output_dir = args.output_dir.expanduser()
|
|
args.datasets_root = args.datasets_root.expanduser()
|
|
assert args.datasets_root.exists()
|
|
args.output_dir.mkdir(exist_ok=True, parents=True)
|
|
|
|
config = get_cfg_defaults()
|
|
print(args)
|
|
|
|
c = config.data
|
|
processor = SpeakerVerificationPreprocessor(
|
|
sampling_rate=c.sampling_rate,
|
|
audio_norm_target_dBFS=c.audio_norm_target_dBFS,
|
|
vad_window_length=c.vad_window_length,
|
|
vad_moving_average_width=c.vad_moving_average_width,
|
|
vad_max_silence_length=c.vad_max_silence_length,
|
|
mel_window_length=c.mel_window_length,
|
|
mel_window_step=c.mel_window_step,
|
|
n_mels=c.n_mels,
|
|
partial_n_frames=c.partial_n_frames,
|
|
min_pad_coverage=c.min_pad_coverage,
|
|
partial_overlap_ratio=c.min_pad_coverage, )
|
|
|
|
preprocess_func = {
|
|
"librispeech_other": process_librispeech,
|
|
"voxceleb1": process_voxceleb1,
|
|
"voxceleb2": process_voxceleb2,
|
|
"aidatatang_200zh": process_aidatatang_200zh,
|
|
"magicdata": process_magicdata,
|
|
}
|
|
|
|
for dataset in args.datasets:
|
|
print("Preprocessing %s" % dataset)
|
|
preprocess_func[dataset](processor, args.datasets_root, args.output_dir,
|
|
args.skip_existing)
|