add AudioDataset and MelDataset

2021-06-10 16:26:09 +08:00 · 2021-06-10 16:26:09 +08:00 · 6a8b3f92df
parent 3c964fde54
commit 6a8b3f92df
2 changed files with 265 additions and 0 deletions
--- a/parakeet/datasets/audio_dataset.py
+++ b/parakeet/datasets/audio_dataset.py
@ -0,0 +1,133 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Union, Optional, Callable, Tuple
 from pathlib import Path
 from multiprocessing import Manager
 import numpy as np
 from paddle.io import Dataset
 import logging
 class AudioDataset(Dataset):
     """Dataset to load audio.

     Audio files are searched recursively under ``root_dir`` and kept in
     sorted path order.

     Parameters
     ----------
     root_dir : Union[Path, str]
         The root of the dataset. A leading ``~`` is expanded.
     audio_pattern : str
         A glob pattern to recursively find all audio files, by default
         "*-wave.npy"
     audio_length_threshold : int, optional
         Only audio with strictly more samples (``shape[0]``) than this
         value is kept. Every matched file is loaded once with
         ``audio_load_fn`` to measure it. By default None, which disables
         the filtering.
     audio_load_fn : Callable, optional
         Function to load the audio; it is called with a Path object,
         by default np.load
     return_utt_id : bool, optional
         Whether to include the utterance identifier in the return value of
         __getitem__, by default False
     use_cache : bool, optional
         Whether to cache seen examples while reading, by default False

     Raises
     ------
     AssertionError
         If no audio file is left after globbing and filtering.
     """

     def __init__(
             self,
             root_dir: Union[Path, str],
             audio_pattern: str="*-wave.npy",
             audio_length_threshold: Optional[int]=None,
             audio_load_fn: Callable=np.load,
             return_utt_id: bool=False,
             use_cache: bool=False, ):
         super().__init__()
         # allow str and Path that contains '~'
         root_dir = Path(root_dir).expanduser()
         # recursively find all of the audio files that match the pattern
         audio_files = sorted(root_dir.rglob(audio_pattern))

         # filter by threshold (strictly longer than the threshold is kept)
         if audio_length_threshold is not None:
             n_found = len(audio_files)
             audio_files = [
                 f for f in audio_files
                 if audio_load_fn(f).shape[0] > audio_length_threshold
             ]
             if len(audio_files) != n_found:
                 logging.warning(
                     f"some files are filtered by audio length threshold "
                     f"({n_found} -> {len(audio_files)}).")

         # Raised explicitly (instead of ``assert``) so that the check is
         # not stripped under ``python -O``; the exception type is unchanged.
         if not audio_files:
             raise AssertionError(f"Not any audio files found in {root_dir}.")

         self.audio_files = audio_files
         self.audio_load_fn = audio_load_fn
         self.return_utt_id = return_utt_id

         # TODO(chenfeiyu): better strategy to get utterance id
         if ".npy" in audio_pattern:
             # Strip only a trailing "-wave.npy"; any other file name falls
             # back to its stem so the extension never leaks into the id.
             suffix = "-wave.npy"
             self.utt_ids = [
                 f.name[:-len(suffix)] if f.name.endswith(suffix) else f.stem
                 for f in audio_files
             ]
         else:
             self.utt_ids = [f.stem for f in audio_files]

         self.use_cache = use_cache
         if use_cache:
             # use manager to share object between multiple processes
             # avoid per-reader process caching
             self.manager = Manager()
             self.caches = self.manager.list([None] * len(audio_files))

     def __getitem__(
             self, idx: int) -> Union[np.ndarray, Tuple[str, np.ndarray]]:
         """Get an example given the index.

         Parameters
         ----------
         idx : int
             The index.

         Returns
         -------
         utt_id : str
             Utterance identifier, only present when ``return_utt_id`` is
             True.
         audio : np.ndarray
             The audio as returned by ``audio_load_fn``; presumably of shape
             (n_samples, ).
         """
         if self.use_cache:
             # Read the proxy once: every access to the managed list is a
             # round trip to the manager process.
             cached = self.caches[idx]
             if cached is not None:
                 return cached

         audio = self.audio_load_fn(self.audio_files[idx])
         items = (self.utt_ids[idx], audio) if self.return_utt_id else audio

         if self.use_cache:
             self.caches[idx] = items
         return items

     def __len__(self) -> int:
         """Returns the size of the dataset.

         Returns
         -------
         int
             The length of the dataset
         """
         return len(self.audio_files)
--- a/parakeet/datasets/mel_dataset.py
+++ b/parakeet/datasets/mel_dataset.py
@ -0,0 +1,132 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Union, Optional, Callable, Tuple
 from pathlib import Path
 from multiprocessing import Manager
 import numpy as np
 from paddle.io import Dataset
 import logging
 class MelDataset(Dataset):
     """Dataset to load mel-spectrograms.

     Mel feature files are searched recursively under ``root_dir`` and kept
     in sorted path order.

     Parameters
     ----------
     root_dir : Union[Path, str]
         The root of the dataset. A leading ``~`` is expanded.
     mel_pattern : str, optional
         A glob pattern to recursively find all mel feature files, by default
         "*-feats.npy"
     mel_length_threshold : Optional[int], optional
         Only mel-spectrograms with strictly more frames (``shape[1]``) than
         this value are kept. Every matched file is loaded once with
         ``mel_load_fn`` to measure it. By default None, which disables the
         filtering.
     mel_load_fn : Callable, optional
         Function to load the mel-spectrogram; it is called with a Path
         object, by default np.load
     return_utt_id : bool, optional
         Whether to include the utterance identifier in the return value of
         __getitem__, by default False
     use_cahce : bool, optional
         Misspelled alias of ``use_cache``, kept for backward compatibility,
         by default False
     use_cache : bool, optional
         Whether to cache seen examples while reading, by default False.
         Caching is enabled when either this or ``use_cahce`` is true.

     Raises
     ------
     AssertionError
         If no mel file is left after globbing and filtering.
     """

     def __init__(
             self,
             root_dir: Union[Path, str],
             mel_pattern: str="*-feats.npy",
             mel_length_threshold: Optional[int]=None,
             mel_load_fn: Callable=np.load,
             return_utt_id: bool=False,
             use_cahce: bool=False,
             use_cache: bool=False, ):
         super().__init__()
         # allow str and Path that contains '~'
         root_dir = Path(root_dir).expanduser()
         # find all of the mel files
         mel_files = sorted(root_dir.rglob(mel_pattern))

         # filter by threshold (strictly longer than the threshold is kept)
         if mel_length_threshold is not None:
             n_found = len(mel_files)
             mel_files = [
                 f for f in mel_files
                 if mel_load_fn(f).shape[1] > mel_length_threshold
             ]
             if len(mel_files) != n_found:
                 logging.warning(
                     f"Some files are filtered by mel length threshold "
                     f"({n_found} -> {len(mel_files)}).")

         # Raised explicitly (instead of ``assert``) so that the check is
         # not stripped under ``python -O``; the exception type is unchanged.
         if not mel_files:
             raise AssertionError(f"Not found any mel files in {root_dir}.")

         self.mel_files = mel_files
         self.mel_load_fn = mel_load_fn

         # TODO(chenfeiyu): better strategy to get utterance id
         if ".npy" in mel_pattern:
             # Strip only a trailing "-feats.npy"; any other file name falls
             # back to its stem so the extension never leaks into the id.
             suffix = "-feats.npy"
             self.utt_ids = [
                 f.name[:-len(suffix)] if f.name.endswith(suffix) else f.stem
                 for f in mel_files
             ]
         else:
             self.utt_ids = [f.stem for f in mel_files]

         self.return_utt_id = return_utt_id
         # accept both the correct spelling and the legacy misspelling
         self.use_cache = use_cache or use_cahce
         if self.use_cache:
             # use manager to share object between multiple processes
             self.manager = Manager()
             self.caches = self.manager.list([None] * len(mel_files))

     def __getitem__(
             self, idx: int) -> Union[np.ndarray, Tuple[str, np.ndarray]]:
         """Get an example given the index.

         Parameters
         ----------
         idx : int
             The index

         Returns
         -------
         utt_id : str
             Utterance identifier, only present when ``return_utt_id`` is
             True.
         mel : np.ndarray
             The mel spectrogram as returned by ``mel_load_fn``; presumably
             of shape (n_mels, n_frames).
         """
         if self.use_cache:
             # Read the proxy once: every access to the managed list is a
             # round trip to the manager process.
             cached = self.caches[idx]
             if cached is not None:
                 return cached

         mel = self.mel_load_fn(self.mel_files[idx])
         items = (self.utt_ids[idx], mel) if self.return_utt_id else mel

         if self.use_cache:
             self.caches[idx] = items
         return items

     def __len__(self) -> int:
         """Returns the size of the dataset.

         Returns
         -------
         int
             The length of the dataset
         """
         return len(self.mel_files)