ParakeetEricRoss/examples/ge2e/inference.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from pathlib import Path

import tqdm
import paddle
import numpy as np

from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder

from audio_processor import SpeakerVerificationPreprocessor
from config import get_cfg_defaults


def embed_utterance(processor, model, fpath_or_wav):
    """Compute the embedding of a single utterance as a numpy array.

    Args:
        processor: Audio processor exposing ``preprocess_wav`` and
            ``extract_mel_partials``.
        model: Speaker encoder exposing ``eval`` and ``embed_utterance``.
        fpath_or_wav: Forwarded unchanged to ``processor.preprocess_wav``
            (presumably a file path or an already-loaded waveform, as the
            name suggests -- verify against the processor).

    Returns:
        The output of ``model.embed_utterance`` converted with ``.numpy()``.
    """
    # audio processor
    wav = processor.preprocess_wav(fpath_or_wav)
    mel_partials = processor.extract_mel_partials(wav)

    # speaker encoder
    # Put the model in eval mode on every call so the function is safe to use
    # on a model that the caller left in training mode.
    model.eval()
    # A single no_grad context is enough; nesting a second one inside it (as
    # the previous version did) has no additional effect.
    with paddle.no_grad():
        embed = model.embed_utterance(paddle.to_tensor(mel_partials))
    return embed.numpy()


def _process_utterance(ifpath: Path,
                       input_dir: Path,
                       output_dir: Path,
                       processor: SpeakerVerificationPreprocessor,
                       model: LSTMSpeakerEncoder):
    """Embed one audio file and save the result under ``output_dir``.

    The output file mirrors the location of ``ifpath`` relative to
    ``input_dir``, with its suffix replaced by ``.npy``. Missing parent
    directories of the output file are created.

    Args:
        ifpath: Path of the input audio file; must be inside ``input_dir``.
        input_dir: Root directory the relative path is computed against.
        output_dir: Root directory the embedding is written under.
        processor: Audio processor passed on to ``embed_utterance``.
        model: Speaker encoder passed on to ``embed_utterance``.
    """
    # Mirror the input tree layout inside the output directory.
    relative = ifpath.relative_to(input_dir)
    target = output_dir.joinpath(relative).with_suffix(".npy")

    # Create the destination directory before computing the embedding.
    target.parent.mkdir(parents=True, exist_ok=True)
    np.save(target, embed_utterance(processor, model, ifpath))


def main(config, args):
    """Embed every matching audio file under ``args.input``.

    Loads the speaker encoder weights from ``args.checkpoint_path`` (with
    ``.pdparams`` appended), builds the audio processor from ``config.data``
    and writes one ``.npy`` embedding per input file under ``args.output``,
    mirroring the directory layout of the input folder.

    Args:
        config: Configuration object; ``config.data`` and ``config.model``
            are read.
        args: Parsed command line arguments; ``device``, ``checkpoint_path``,
            ``input``, ``pattern`` and ``output`` are read.
    """
    paddle.set_device(args.device)

    # load model
    model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
                               config.model.hidden_size,
                               config.model.embedding_size)
    weights_fpath = str(Path(args.checkpoint_path).expanduser())
    model_state_dict = paddle.load(weights_fpath + ".pdparams")
    model.set_state_dict(model_state_dict)
    model.eval()
    print(f"Loaded encoder {weights_fpath}")

    # create audio processor
    c = config.data
    # The previous version passed ``c.min_pad_coverage`` as the overlap ratio,
    # which looks like a copy-paste slip. Prefer the dedicated config key and
    # fall back to the old value when the config does not define it.
    # NOTE(review): confirm that config.data declares partial_overlap_ratio.
    partial_overlap_ratio = getattr(c, "partial_overlap_ratio",
                                    c.min_pad_coverage)
    processor = SpeakerVerificationPreprocessor(
        sampling_rate=c.sampling_rate,
        audio_norm_target_dBFS=c.audio_norm_target_dBFS,
        vad_window_length=c.vad_window_length,
        vad_moving_average_width=c.vad_moving_average_width,
        vad_max_silence_length=c.vad_max_silence_length,
        mel_window_length=c.mel_window_length,
        mel_window_step=c.mel_window_step,
        n_mels=c.n_mels,
        partial_n_frames=c.partial_n_frames,
        min_pad_coverage=c.min_pad_coverage,
        partial_overlap_ratio=partial_overlap_ratio, )

    # input output preparation
    input_dir = Path(args.input).expanduser()
    ifpaths = list(input_dir.rglob(args.pattern))
    print(f"{len(ifpaths)} utterances in total")
    output_dir = Path(args.output).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    for ifpath in tqdm.tqdm(ifpaths, unit="utterance"):
        _process_utterance(ifpath, input_dir, output_dir, processor, model)


if __name__ == "__main__":
    # Command line entry point: build the config, parse arguments, run main().
    config = get_cfg_defaults()
    parser = argparse.ArgumentParser(description="compute utterance embed.")
    parser.add_argument(
        "--config",
        metavar="FILE",
        help="path of the config file to overwrite to default config with.")
    # --input, --output and --checkpoint_path are passed to Path(...)
    # unconditionally in main(), so a missing value used to surface as a
    # TypeError traceback. Marking them required gives a clear usage error.
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="path of the audio_file folder.")
    parser.add_argument(
        "--pattern",
        type=str,
        default="*.wav",
        help="pattern to filter audio files.")
    parser.add_argument(
        "--output",
        metavar="OUTPUT_DIR",
        required=True,
        help="directory to save the utterance embeddings (.npy files).")

    # load from saved checkpoint
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        required=True,
        help="path of the checkpoint to load")

    # running
    parser.add_argument(
        "--device",
        type=str,
        choices=["cpu", "gpu"],
        help="device type to use, cpu and gpu are supported.")

    # overwrite extra config and default config
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )

    args = parser.parse_args()
    # Precedence (lowest to highest): defaults, --config file, --opts pairs.
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
add ge2e and tacotron2_aishell3 example (#107) * hacky thing, add tone support for acoustic model * fix experiments for waveflow and wavenet, only write visual log in rank-0 * use emb add in tacotron2 * 1. remove space from numericalized representation; 2. fix decoder padding mask's unsqueeze dim. * remove bn in postnet * refactoring code * add an option to normalize volume when loading audio. * add an embedding layer. * 1. change the default min value of LogMagnitude to 1e-5; 2. remove stop logit prediction from tacotron2 model. * WIP: baker * add ge2e * fix lstm speaker encoder * fix lstm speaker encoder * fix speaker encoder and add support for 2 more datasets * simplify visualization code * add a simple strategy to support multispeaker for tacotron. * add vctk example for refactored tacotron * fix indentation * fix class name * fix visualizer * fix root path * fix root path * fix root path * fix typos * fix bugs * fix text log extension name * add example for baker and aishell3 * update experiment and display * format code for tacotron_vctk, add plot_waveform to display * add new trainer * minor fix * add global condition support for tacotron2 * add gst layer * add 2 frontend * fix fmax for example/waveflow * update collate function, data loader now does not convert nested list into numpy array. 
* WIP: add hifigan * WIP:update hifigan * change stft to use conv1d * add audio datasets * change batch_text_id, batch_spec, batch_wav to include valid lengths in the returned value * change wavenet to use on-the-fly preprocessing * fix typos * resolve conflict * remove imports that are removed * remove files not included in this release * remove imports to deleted modules * move tacotron2_msp * clean code * fix argument order * fix argument name * clean code for data processing * WIP: add README * add more details to the README, fix some preprocess scripts * add voice cloning notebook * add an option to alter the loss and model structure of tacotron2, add an alternative config * add plot_multiple_attentions and update visualization code in transformer_tts * format code * remove tacotron2_msp * update tacotron2 from_pretrained, update setup.py * update tacotron2 * update tacotron_aishell3's README * add images for examples/tacotron2_aishell3's README * update README for examples/ge2e * add STFT back * add extra_config keys into the default config of tacotron * fix typos and docs * update README and doc * update docstrings for tacotron * update doc * update README * add links to download pretrained models * refine READMEs and clean code * add praatio into requirements for running the experiments * format code with pre-commit * simplify text processing code and update notebook 2021-05-13 17:49:50 +08:00			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import argparse`
			`from pathlib import Path`

			`import tqdm`
			`import paddle`
			`import numpy as np`

			`from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder`

			`from audio_processor import SpeakerVerificationPreprocessor`
			`from config import get_cfg_defaults`


			`def embed_utterance(processor, model, fpath_or_wav):`
			`# audio processor`
			`wav = processor.preprocess_wav(fpath_or_wav)`
			`mel_partials = processor.extract_mel_partials(wav)`

			`model.eval()`
			`# speaker encoder`
			`with paddle.no_grad():`
			`mel_partials = paddle.to_tensor(mel_partials)`
			`with paddle.no_grad():`
			`embed = model.embed_utterance(mel_partials)`
			`embed = embed.numpy()`
			`return embed`


			`def _process_utterance(ifpath: Path,`
			`input_dir: Path,`
			`output_dir: Path,`
			`processor: SpeakerVerificationPreprocessor,`
			`model: LSTMSpeakerEncoder):`
			`rel_path = ifpath.relative_to(input_dir)`
			`ofpath = (output_dir / rel_path).with_suffix(".npy")`
			`ofpath.parent.mkdir(parents=True, exist_ok=True)`
			`embed = embed_utterance(processor, model, ifpath)`
			`np.save(ofpath, embed)`


			`def main(config, args):`
			`paddle.set_device(args.device)`

			`# load model`
			`model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,`
			`config.model.hidden_size,`
			`config.model.embedding_size)`
			`weights_fpath = str(Path(args.checkpoint_path).expanduser())`
			`model_state_dict = paddle.load(weights_fpath + ".pdparams")`
			`model.set_state_dict(model_state_dict)`
			`model.eval()`
			`print(f"Loaded encoder {weights_fpath}")`

			`# create audio processor`
			`c = config.data`
			`processor = SpeakerVerificationPreprocessor(`
			`sampling_rate=c.sampling_rate,`
			`audio_norm_target_dBFS=c.audio_norm_target_dBFS,`
			`vad_window_length=c.vad_window_length,`
			`vad_moving_average_width=c.vad_moving_average_width,`
			`vad_max_silence_length=c.vad_max_silence_length,`
			`mel_window_length=c.mel_window_length,`
			`mel_window_step=c.mel_window_step,`
			`n_mels=c.n_mels,`
			`partial_n_frames=c.partial_n_frames,`
			`min_pad_coverage=c.min_pad_coverage,`
			`partial_overlap_ratio=c.min_pad_coverage, )`

			`# input output preparation`
			`input_dir = Path(args.input).expanduser()`
			`ifpaths = list(input_dir.rglob(args.pattern))`
			`print(f"{len(ifpaths)} utterances in total")`
			`output_dir = Path(args.output).expanduser()`
			`output_dir.mkdir(parents=True, exist_ok=True)`

			`for ifpath in tqdm.tqdm(ifpaths, unit="utterance"):`
			`_process_utterance(ifpath, input_dir, output_dir, processor, model)`


			`if __name__ == "__main__":`
			`config = get_cfg_defaults()`
			`parser = argparse.ArgumentParser(description="compute utterance embed.")`
			`parser.add_argument(`
			`"--config",`
			`metavar="FILE",`
			`help="path of the config file to overwrite to default config with.")`
			`parser.add_argument(`
			`"--input", type=str, help="path of the audio_file folder.")`
			`parser.add_argument(`
			`"--pattern",`
			`type=str,`
			`default="*.wav",`
			`help="pattern to filter audio files.")`
			`parser.add_argument(`
			`"--output",`
			`metavar="OUTPUT_DIR",`
			`help="path to save checkpoint and logs.")`

			`# load from saved checkpoint`
			`parser.add_argument(`
			`"--checkpoint_path", type=str, help="path of the checkpoint to load")`

			`# running`
			`parser.add_argument(`
			`"--device",`
			`type=str,`
			`choices=["cpu", "gpu"],`
			`help="device type to use, cpu and gpu are supported.")`

			`# overwrite extra config and default config`
			`parser.add_argument(`
			`"--opts",`
			`nargs=argparse.REMAINDER,`
			`help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"`
			`)`

			`args = parser.parse_args()`
			`if args.config:`
			`config.merge_from_file(args.config)`
			`if args.opts:`
			`config.merge_from_list(args.opts)`
			`config.freeze()`
			`print(config)`
			`print(args)`

			`main(config, args)`