Parakeet/examples/ge2e/preprocess.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from pathlib import Path
from config import get_cfg_defaults
from audio_processor import SpeakerVerificationPreprocessor
from dataset_processors import (process_librispeech, process_voxceleb1,
                                process_voxceleb2, process_aidatatang_200zh,
                                process_magicdata)

def parse_args(argv=None):
    """Parse and normalize the command line options of this script.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse. ``None`` (the default) makes argparse read
        ``sys.argv[1:]``.

    Returns
    -------
    argparse.Namespace
        Namespace with ``datasets_root`` and ``output_dir`` (both ``Path``
        with ``~`` expanded), ``dataset_names`` (the raw string),
        ``datasets`` (list of stripped, non-empty names) and
        ``skip_existing``. ``no_trim`` is removed from the namespace.

    Raises
    ------
    SystemExit
        Raised by argparse when ``--datasets_root`` is missing or the
        command line is malformed.
    ModuleNotFoundError
        If ``--no_trim`` is not given and ``webrtcvad`` cannot be imported.
    """
    parser = argparse.ArgumentParser(
        description="preprocess dataset for speaker verification task")
    parser.add_argument(
        "--datasets_root",
        type=Path,
        # Every later step dereferences this path, so fail at parse time
        # instead of with an AttributeError on None.
        required=True,
        help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."
    )
    parser.add_argument(
        "--output_dir", type=Path, help="Path to save processed dataset.")
    parser.add_argument(
        "--dataset_names",
        type=str,
        default="librispeech_other,voxceleb1,voxceleb2",
        help="comma-separated list of names of the datasets you want to preprocess. only "
        "the train set of these datastes will be used. Possible names: librispeech_other, "
        "voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")
    parser.add_argument(
        "--skip_existing",
        action="store_true",
        help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
    )
    parser.add_argument(
        "--no_trim",
        action="store_true",
        help="Preprocess audio without trimming silences (not recommended).")

    args = parser.parse_args(argv)

    if not args.no_trim:
        try:
            # Imported only to fail fast when the package is unavailable.
            import webrtcvad  # noqa: F401
        except ImportError as err:
            # Catch ImportError only: a bare ``except`` would also relabel
            # KeyboardInterrupt and unrelated errors as "package not found".
            raise ModuleNotFoundError(
                "Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and "
                "try again. If installation fails, "
                "use --no_trim to disable this error message.") from err
    # The flag is consumed by the check above and is not passed on to the
    # dataset processors.
    del args.no_trim

    # Drop empty entries so that a trailing comma or doubled comma does not
    # produce an empty dataset name.
    args.datasets = [
        name.strip() for name in args.dataset_names.split(",")
        if name.strip()
    ]

    args.datasets_root = args.datasets_root.expanduser()
    # argparse always sets the attribute (to None when the option is
    # omitted), so the default must be chosen by testing for None.
    if args.output_dir is None:
        args.output_dir = args.datasets_root / "SV2TTS" / "encoder"
    else:
        args.output_dir = args.output_dir.expanduser()
    return args


def main(argv=None):
    """Preprocess the selected datasets for the speaker verification task.

    Parameters
    ----------
    argv : list of str, optional
        Forwarded to ``parse_args``; ``None`` means ``sys.argv[1:]``.

    Raises
    ------
    FileNotFoundError
        If the datasets root directory does not exist.
    ValueError
        If an unsupported dataset name was requested.
    """
    args = parse_args(argv)

    # Explicit check instead of ``assert``, which is stripped under ``-O``.
    if not args.datasets_root.exists():
        raise FileNotFoundError(
            "datasets_root does not exist: %s" % args.datasets_root)

    preprocess_func = {
        "librispeech_other": process_librispeech,
        "voxceleb1": process_voxceleb1,
        "voxceleb2": process_voxceleb2,
        "aidatatang_200zh": process_aidatatang_200zh,
        "magicdata": process_magicdata,
    }

    # Validate every requested name before any processing starts, so a typo
    # in the last name does not surface only after earlier datasets ran.
    unknown = [name for name in args.datasets if name not in preprocess_func]
    if unknown:
        raise ValueError("Unknown dataset name(s): %s. Possible names: %s." %
                         (", ".join(unknown), ", ".join(preprocess_func)))

    args.output_dir.mkdir(exist_ok=True, parents=True)

    config = get_cfg_defaults()
    print(args)

    c = config.data
    processor = SpeakerVerificationPreprocessor(
        sampling_rate=c.sampling_rate,
        audio_norm_target_dBFS=c.audio_norm_target_dBFS,
        vad_window_length=c.vad_window_length,
        vad_moving_average_width=c.vad_moving_average_width,
        vad_max_silence_length=c.vad_max_silence_length,
        mel_window_length=c.mel_window_length,
        mel_window_step=c.mel_window_step,
        n_mels=c.n_mels,
        partial_n_frames=c.partial_n_frames,
        min_pad_coverage=c.min_pad_coverage,
        # NOTE(review): this passes min_pad_coverage as the overlap ratio,
        # which looks like a copy/paste slip. If the config defines
        # ``partial_overlap_ratio``, that is presumably the intended value;
        # confirm against config.py before changing, since it alters the
        # produced features.
        partial_overlap_ratio=c.min_pad_coverage, )

    for dataset in args.datasets:
        print("Preprocessing %s" % dataset)
        preprocess_func[dataset](processor, args.datasets_root,
                                 args.output_dir, args.skip_existing)


if __name__ == "__main__":
    main()
add ge2e and tacotron2_aishell3 example (#107) * hacky thing, add tone support for acoustic model * fix experiments for waveflow and wavenet, only write visual log in rank-0 * use emb add in tacotron2 * 1. remove space from numericalized representation; 2. fix decoder padding mask's unsqueeze dim. * remove bn in postnet * refactoring code * add an option to normalize volume when loading audio. * add an embedding layer. * 1. change the default min value of LogMagnitude to 1e-5; 2. remove stop logit prediction from tacotron2 model. * WIP: baker * add ge2e * fix lstm speaker encoder * fix lstm speaker encoder * fix speaker encoder and add support for 2 more datasets * simplify visualization code * add a simple strategy to support multispeaker for tacotron. * add vctk example for refactored tacotron * fix indentation * fix class name * fix visualizer * fix root path * fix root path * fix root path * fix typos * fix bugs * fix text log extension name * add example for baker and aishell3 * update experiment and display * format code for tacotron_vctk, add plot_waveform to display * add new trainer * minor fix * add global condition support for tacotron2 * add gst layer * add 2 frontend * fix fmax for example/waveflow * update collate function, data loader now does not convert nested list into numpy array. 
* WIP: add hifigan * WIP: update hifigan * change stft to use conv1d * add audio datasets * change batch_text_id, batch_spec, batch_wav to include valid lengths in the returned value * change wavenet to use on-the-fly preprocessing * fix typos * resolve conflict * remove imports that are removed * remove files not included in this release * remove imports to deleted modules * move tacotron2_msp * clean code * fix argument order * fix argument name * clean code for data processing * WIP: add README * add more details to the README, fix some preprocess scripts * add voice cloning notebook * add an option to alter the loss and model structure of tacotron2, add an alternative config * add plot_multiple_attentions and update visualization code in transformer_tts * format code * remove tacotron2_msp * update tacotron2 from_pretrained, update setup.py * update tacotron2 * update tacotron_aishell3's README * add images for examples/tacotron2_aishell3's README * update README for examples/ge2e * add STFT back * add extra_config keys into the default config of tacotron * fix typos and docs * update README and doc * update docstrings for tacotron * update doc * update README * add links to download pretrained models * refine READMEs and clean code * add praatio into requirements for running the experiments * format code with pre-commit * simplify text processing code and update notebook 2021-05-13 17:49:50 +08:00			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import argparse`
			`from pathlib import Path`
			`from config import get_cfg_defaults`
			`from audio_processor import SpeakerVerificationPreprocessor`
			`from dataset_processors import (process_librispeech, process_voxceleb1,`
			`process_voxceleb2, process_aidatatang_200zh,`
			`process_magicdata)`

			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(`
			`description="preprocess dataset for speaker verification task")`
			`parser.add_argument(`
			`"--datasets_root",`
			`type=Path,`
			`help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets."`
			`)`
			`parser.add_argument(`
			`"--output_dir", type=Path, help="Path to save processed dataset.")`
			`parser.add_argument(`
			`"--dataset_names",`
			`type=str,`
			`default="librispeech_other,voxceleb1,voxceleb2",`
			`help="comma-separated list of names of the datasets you want to preprocess. only "`
			`"the train set of these datastes will be used. Possible names: librispeech_other, "`
			`"voxceleb1, voxceleb2, aidatatang_200zh, magicdata.")`
			`parser.add_argument(`
			`"--skip_existing",`
			`action="store_true",`
			`help="Whether to skip ouput files with the same name. Useful if this script was interrupted."`
			`)`
			`parser.add_argument(`
			`"--no_trim",`
			`action="store_true",`
			`help="Preprocess audio without trimming silences (not recommended).")`

			`args = parser.parse_args()`

			`if not args.no_trim:`
			`try:`
			`import webrtcvad`
			`except:`
			`raise ModuleNotFoundError(`
			`"Package 'webrtcvad' not found. This package enables "`
			`"noise removal and is recommended. Please install and "`
			`"try again. If installation fails, "`
			`"use --no_trim to disable this error message.")`
			`del args.no_trim`

			`args.datasets = [item.strip() for item in args.dataset_names.split(",")]`
			`if not hasattr(args, "output_dir"):`
			`args.output_dir = args.dataset_root / "SV2TTS" / "encoder"`

			`args.output_dir = args.output_dir.expanduser()`
			`args.datasets_root = args.datasets_root.expanduser()`
			`assert args.datasets_root.exists()`
			`args.output_dir.mkdir(exist_ok=True, parents=True)`

			`config = get_cfg_defaults()`
			`print(args)`

			`c = config.data`
			`processor = SpeakerVerificationPreprocessor(`
			`sampling_rate=c.sampling_rate,`
			`audio_norm_target_dBFS=c.audio_norm_target_dBFS,`
			`vad_window_length=c.vad_window_length,`
			`vad_moving_average_width=c.vad_moving_average_width,`
			`vad_max_silence_length=c.vad_max_silence_length,`
			`mel_window_length=c.mel_window_length,`
			`mel_window_step=c.mel_window_step,`
			`n_mels=c.n_mels,`
			`partial_n_frames=c.partial_n_frames,`
			`min_pad_coverage=c.min_pad_coverage,`
			`partial_overlap_ratio=c.min_pad_coverage, )`

			`preprocess_func = {`
			`"librispeech_other": process_librispeech,`
			`"voxceleb1": process_voxceleb1,`
			`"voxceleb2": process_voxceleb2,`
			`"aidatatang_200zh": process_aidatatang_200zh,`
			`"magicdata": process_magicdata,`
			`}`

			`for dataset in args.datasets:`
			`print("Preprocessing %s" % dataset)`
			`preprocess_func[dataset](processor, args.datasets_root,`
			`args.output_dir, args.skip_existing)`