# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from warnings import warn
import struct
from scipy.ndimage import binary_dilation
import numpy as np
import librosa

try:
    import webrtcvad
except ModuleNotFoundError:
    warn("Unable to import 'webrtcvad'. "
         "This package enables noise removal and is recommended.")
    webrtcvad = None
INT16_MAX = (2**15) - 1


def normalize_volume(wav,
target_dBFS,
increase_only=False,
decrease_only=False):
    # This function implements loudness normalization rather than peak
    # normalization. See https://en.wikipedia.org/wiki/Audio_normalization
    # dBFS: decibels relative to full scale.
    # See https://en.wikipedia.org/wiki/DBFS for more details.
    # For 16-bit PCM audio, the minimal level is -96 dBFS.
    # Compute the mean dBFS and adjust the waveform toward the target dBFS
    # by applying a uniform gain.
if increase_only and decrease_only:
raise ValueError("Both increase only and decrease only are set")
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
if ((dBFS_change < 0 and increase_only) or
(dBFS_change > 0 and decrease_only)):
return wav
gain = 10**(dBFS_change / 20)
return wav * gain
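

# A quick sanity check of the gain math above (illustrative values, not part
# of the original module): a waveform with mean power 1e-3 sits at
# 10 * log10(1e-3) = -30 dBFS; normalizing it to -20 dBFS applies a gain of
# 10 ** (10 / 20), about 3.16.
#
#   wav = np.random.uniform(-0.03, 0.03, 16000).astype(np.float32)
#   normalized = normalize_volume(wav, target_dBFS=-20.0)
#   print(10 * np.log10(np.mean(normalized**2)))  # close to -20.0

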
def trim_long_silences(wav,
vad_window_length: int,
vad_moving_average_width: int,
vad_max_silence_length: int,
sampling_rate: int):
"""
Ensures that segments without voice in the waveform remain no longer than a
threshold determined by the VAD parameters in params.py.
:param wav: the raw waveform as a numpy array of floats
:return: the same waveform with silences trimmed away (length <= original wav length)
"""
# Compute the voice detection window size
samples_per_window = (vad_window_length * sampling_rate) // 1000
# Trim the end of the audio to have a multiple of the window size
wav = wav[:len(wav) - (len(wav) % samples_per_window)]
# Convert the float waveform to 16-bit mono PCM
pcm_wave = struct.pack("%dh" % len(wav),
*(np.round(wav * INT16_MAX)).astype(np.int16))
    # Perform voice activity detection
voice_flags = []
vad = webrtcvad.Vad(mode=3)
for window_start in range(0, len(wav), samples_per_window):
window_end = window_start + samples_per_window
voice_flags.append(
vad.is_speech(
pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)
# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros((width - 1) // 2), array,
np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask,
np.ones(vad_max_silence_length + 1))
audio_mask = np.repeat(audio_mask, samples_per_window)
return wav[audio_mask]
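

# Example usage of the VAD trimming above (a sketch; "utterance.wav" and the
# parameter values are assumptions, not defaults from this module):
#
#   wav, sr = librosa.load("utterance.wav", sr=16000)
#   if webrtcvad is not None:
#       wav = trim_long_silences(
#           wav,
#           vad_window_length=30,        # ms; webrtcvad accepts 10/20/30 ms
#           vad_moving_average_width=8,  # windows
#           vad_max_silence_length=6,    # windows
#           sampling_rate=16000)         # webrtcvad supports 8/16/32/48 kHz

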
def compute_partial_slices(n_samples: int,
partial_utterance_n_frames: int,
hop_length: int,
min_pad_coverage: float=0.75,
overlap: float=0.5):
"""
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
spectrogram slices are returned, so as to make each partial utterance waveform correspond to
    its spectrogram. This function assumes that the mel spectrogram is computed
    with the <hop_length> passed in.
    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
:param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
        utterance
    :param hop_length: hop length (in samples) used to compute the mel spectrogram
:param min_pad_coverage: when reaching the last partial utterance, it may or may not have
enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
utterance, this parameter is ignored so that the function always returns at least 1 slice.
:param overlap: by how much the partial utterance should overlap. If set to 0, the partial
utterances are entirely disjoint.
:return: the waveform slices and mel spectrogram slices as lists of array slices. Index
respectively the waveform and the mel spectrogram with these slices to obtain the partial
utterances.
"""
assert 0 <= overlap < 1
assert 0 < min_pad_coverage <= 1
    # number of frames, matching librosa's framing of num_samples
    n_frames = int(np.ceil((n_samples + 1) / hop_length))
    # frame shift between adjacent partials
frame_step = max(
1, int(np.round(partial_utterance_n_frames * (1 - overlap))))
# Compute the slices
wav_slices, mel_slices = [], []
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
for i in range(0, steps, frame_step):
mel_range = np.array([i, i + partial_utterance_n_frames])
wav_range = mel_range * hop_length
mel_slices.append(slice(*mel_range))
wav_slices.append(slice(*wav_range))
# Evaluate whether extra padding is warranted or not
last_wav_range = wav_slices[-1]
coverage = (n_samples - last_wav_range.start) / (
last_wav_range.stop - last_wav_range.start)
if coverage < min_pad_coverage and len(mel_slices) > 1:
mel_slices = mel_slices[:-1]
wav_slices = wav_slices[:-1]
return wav_slices, mel_slices
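

# A worked example of the slicing logic (illustrative numbers): with
# hop_length=160 at 16 kHz, n_samples=32000 gives
# n_frames = ceil(32001 / 160) = 201. With partial_utterance_n_frames=160 and
# overlap=0.5, frame_step=80, so partials start at frames 0, 80, ... and each
# spans 160 frames; wav_slices are the same ranges scaled by hop_length.
#
#   wav_slices, mel_slices = compute_partial_slices(
#       32000, partial_utterance_n_frames=160, hop_length=160)
#   # mel_slices -> [slice(0, 160), slice(80, 240)]; the last slice is kept
#   # here because it covers exactly min_pad_coverage (0.75) of a partial.

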
class SpeakerVerificationPreprocessor(object):
def __init__(self,
sampling_rate: int,
audio_norm_target_dBFS: float,
vad_window_length,
vad_moving_average_width,
vad_max_silence_length,
mel_window_length,
mel_window_step,
n_mels,
partial_n_frames: int,
min_pad_coverage: float=0.75,
partial_overlap_ratio: float=0.5):
self.sampling_rate = sampling_rate
self.audio_norm_target_dBFS = audio_norm_target_dBFS
self.vad_window_length = vad_window_length
self.vad_moving_average_width = vad_moving_average_width
self.vad_max_silence_length = vad_max_silence_length
self.n_fft = int(mel_window_length * sampling_rate / 1000)
self.hop_length = int(mel_window_step * sampling_rate / 1000)
self.n_mels = n_mels
self.partial_n_frames = partial_n_frames
self.min_pad_coverage = min_pad_coverage
self.partial_overlap_ratio = partial_overlap_ratio

    def preprocess_wav(self, fpath_or_wav, source_sr=None):
# Load the wav from disk if needed
if isinstance(fpath_or_wav, (str, Path)):
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
else:
wav = fpath_or_wav
        # Resample if a numpy array was passed and its sr does not match
        if source_sr is not None and source_sr != self.sampling_rate:
            wav = librosa.resample(
                wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization
wav = normalize_volume(
wav, self.audio_norm_target_dBFS, increase_only=True)
# trim long silence
if webrtcvad:
wav = trim_long_silences(
wav, self.vad_window_length, self.vad_moving_average_width,
self.vad_max_silence_length, self.sampling_rate)
return wav

    def melspectrogram(self, wav):
        mel = librosa.feature.melspectrogram(
            y=wav,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels)
mel = mel.astype(np.float32).T
return mel

    def extract_mel_partials(self, wav):
wav_slices, mel_slices = compute_partial_slices(
len(wav), self.partial_n_frames, self.hop_length,
self.min_pad_coverage, self.partial_overlap_ratio)
# pad audio if needed
max_wave_length = wav_slices[-1].stop
if max_wave_length >= len(wav):
wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
# Split the utterance into partials
frames = self.melspectrogram(wav)
frames_batch = np.array([frames[s] for s in mel_slices])
return frames_batch # [B, T, C]
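

if __name__ == "__main__":
    # Minimal end-to-end sketch of the preprocessing pipeline. The parameter
    # values below are illustrative assumptions (typical GE2E-style settings),
    # not values taken from this repository's configs.
    processor = SpeakerVerificationPreprocessor(
        sampling_rate=16000,
        audio_norm_target_dBFS=-30,
        vad_window_length=30,  # ms
        vad_moving_average_width=8,  # windows
        vad_max_silence_length=6,  # windows
        mel_window_length=25,  # ms
        mel_window_step=10,  # ms
        n_mels=40,
        partial_n_frames=160)
    # A synthetic one-second signal stands in for a real recording.
    dummy_wav = np.random.uniform(-0.5, 0.5, 16000).astype(np.float32)
    wav = processor.preprocess_wav(dummy_wav, source_sr=16000)
    partials = processor.extract_mel_partials(wav)
    print(partials.shape)  # (B, partial_n_frames, n_mels)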